Libcroco
|
00001 /* -*- Mode: C; indent-tabs-mode: nil; c-basic-offset: 8 -*- */ 00002 00003 /* 00004 * This file is part of The Croco Library 00005 * 00006 * This program is free software; you can redistribute it and/or 00007 * modify it under the terms of version 2.1 of the GNU Lesser General Public 00008 * License as published by the Free Software Foundation. 00009 * 00010 * This program is distributed in the hope that it will be useful, 00011 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00012 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00013 * GNU General Public License for more details. 00014 * 00015 * You should have received a copy of the GNU Lesser General Public License 00016 * along with this program; if not, write to the Free Software 00017 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 00018 * USA 00019 * 00020 * Author: Dodji Seketeli 00021 * See COPYRIGHTS file for copyright information. 00022 */ 00023 00024 #include "cr-utils.h" 00025 #include "cr-string.h" 00026 00027 /** 00028 *@file: 00029 *Some misc utility functions used 00030 *in the libcroco. 00031 *Note that troughout this file I will 00032 *refer to the CSS SPECIFICATIONS DOCUMENTATION 00033 *written by the w3c guys. You can find that document 00034 *at http://www.w3.org/TR/REC-CSS2/ . 00035 */ 00036 00037 /**************************** 00038 *Encoding transformations and 00039 *encoding helpers 00040 ****************************/ 00041 00042 /* 00043 *Here is the correspondance between the ucs-4 charactere codes 00044 *and there matching utf-8 encoding pattern as dscribed by RFC 2279: 00045 * 00046 *UCS-4 range (hex.) UTF-8 octet sequence (binary) 00047 *------------------ ----------------------------- 00048 *0000 0000-0000 007F 0xxxxxxx 00049 *0000 0080-0000 07FF 110xxxxx 10xxxxxx 00050 *0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx 00051 *0001 0000-001F FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 00052 *0020 0000-03FF FFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 00053 *0400 0000-7FFF FFFF 1111110x 10xxxxxx ... 10xxxxxx 00054 */ 00055 00056 /** 00057 *Given an utf8 string buffer, calculates 00058 *the length of this string if it was encoded 00059 *in ucs4. 00060 *@param a_in_start a pointer to the begining of 00061 *the input utf8 string. 00062 *@param a_in_end a pointre to the end of the input 00063 *utf8 string (points to the last byte of the buffer) 00064 *@param a_len out parameter the calculated length. 00065 *@return CR_OK upon succesfull completion, an error code 00066 *otherwise. 00067 */ 00068 enum CRStatus 00069 cr_utils_utf8_str_len_as_ucs4 (const guchar * a_in_start, 00070 const guchar * a_in_end, gulong * a_len) 00071 { 00072 guchar *byte_ptr = NULL; 00073 gint len = 0; 00074 00075 /* 00076 *to store the final decoded 00077 *unicode char 00078 */ 00079 guint c = 0; 00080 00081 g_return_val_if_fail (a_in_start && a_in_end && a_len, 00082 CR_BAD_PARAM_ERROR); 00083 *a_len = 0; 00084 00085 for (byte_ptr = (guchar *) a_in_start; 00086 byte_ptr <= a_in_end; byte_ptr++) { 00087 gint nb_bytes_2_decode = 0; 00088 00089 if (*byte_ptr <= 0x7F) { 00090 /* 00091 *7 bits long char 00092 *encoded over 1 byte: 00093 * 0xxx xxxx 00094 */ 00095 c = *byte_ptr; 00096 nb_bytes_2_decode = 1; 00097 00098 } else if ((*byte_ptr & 0xE0) == 0xC0) { 00099 /* 00100 *up to 11 bits long char. 00101 *encoded over 2 bytes: 00102 *110x xxxx 10xx xxxx 00103 */ 00104 c = *byte_ptr & 0x1F; 00105 nb_bytes_2_decode = 2; 00106 00107 } else if ((*byte_ptr & 0xF0) == 0xE0) { 00108 /* 00109 *up to 16 bit long char 00110 *encoded over 3 bytes: 00111 *1110 xxxx 10xx xxxx 10xx xxxx 00112 */ 00113 c = *byte_ptr & 0x0F; 00114 nb_bytes_2_decode = 3; 00115 00116 } else if ((*byte_ptr & 0xF8) == 0xF0) { 00117 /* 00118 *up to 21 bits long char 00119 *encoded over 4 bytes: 00120 *1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx 00121 */ 00122 c = *byte_ptr & 0x7; 00123 nb_bytes_2_decode = 4; 00124 00125 } else if ((*byte_ptr & 0xFC) == 0xF8) { 00126 /* 00127 *up to 26 bits long char 00128 *encoded over 5 bytes. 00129 *1111 10xx 10xx xxxx 10xx xxxx 00130 *10xx xxxx 10xx xxxx 00131 */ 00132 c = *byte_ptr & 3; 00133 nb_bytes_2_decode = 5; 00134 00135 } else if ((*byte_ptr & 0xFE) == 0xFC) { 00136 /* 00137 *up to 31 bits long char 00138 *encoded over 6 bytes: 00139 *1111 110x 10xx xxxx 10xx xxxx 00140 *10xx xxxx 10xx xxxx 10xx xxxx 00141 */ 00142 c = *byte_ptr & 1; 00143 nb_bytes_2_decode = 6; 00144 00145 } else { 00146 /* 00147 *BAD ENCODING 00148 */ 00149 return CR_ENCODING_ERROR; 00150 } 00151 00152 /* 00153 *Go and decode the remaining byte(s) 00154 *(if any) to get the current character. 00155 */ 00156 for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) { 00157 /*decode the next byte */ 00158 byte_ptr++; 00159 00160 /*byte pattern must be: 10xx xxxx */ 00161 if ((*byte_ptr & 0xC0) != 0x80) { 00162 return CR_ENCODING_ERROR; 00163 } 00164 00165 c = (c << 6) | (*byte_ptr & 0x3F); 00166 } 00167 00168 len++; 00169 } 00170 00171 *a_len = len; 00172 00173 return CR_OK; 00174 } 00175 00176 /** 00177 *Given an ucs4 string, this function 00178 *returns the size (in bytes) this string 00179 *would have occupied if it was encoded in utf-8. 00180 *@param a_in_start a pointer to the beginning of the input 00181 *buffer. 00182 *@param a_in_end a pointer to the end of the input buffer. 00183 *@param a_len out parameter. The computed length. 00184 *@return CR_OK upon successfull completion, an error code otherwise. 00185 */ 00186 enum CRStatus 00187 cr_utils_ucs4_str_len_as_utf8 (const guint32 * a_in_start, 00188 const guint32 * a_in_end, gulong * a_len) 00189 { 00190 gint len = 0; 00191 guint32 *char_ptr = NULL; 00192 00193 g_return_val_if_fail (a_in_start && a_in_end && a_len, 00194 CR_BAD_PARAM_ERROR); 00195 00196 for (char_ptr = (guint32 *) a_in_start; 00197 char_ptr <= a_in_end; char_ptr++) { 00198 if (*char_ptr <= 0x7F) { 00199 /*the utf-8 char would take 1 byte */ 00200 len += 1; 00201 } else if (*char_ptr <= 0x7FF) { 00202 /*the utf-8 char would take 2 bytes */ 00203 len += 2; 00204 } else if (*char_ptr <= 0xFFFF) { 00205 len += 3; 00206 } else if (*char_ptr <= 0x1FFFFF) { 00207 len += 4; 00208 } else if (*char_ptr <= 0x3FFFFFF) { 00209 len += 5; 00210 } else if (*char_ptr <= 0x7FFFFFFF) { 00211 len += 6; 00212 } 00213 } 00214 00215 *a_len = len; 00216 return CR_OK; 00217 } 00218 00219 /** 00220 *Given an ucsA string, this function 00221 *returns the size (in bytes) this string 00222 *would have occupied if it was encoded in utf-8. 00223 *@param a_in_start a pointer to the beginning of the input 00224 *buffer. 00225 *@param a_in_end a pointer to the end of the input buffer. 00226 *@param a_len out parameter. The computed length. 00227 *@return CR_OK upon successfull completion, an error code otherwise. 00228 */ 00229 enum CRStatus 00230 cr_utils_ucs1_str_len_as_utf8 (const guchar * a_in_start, 00231 const guchar * a_in_end, gulong * a_len) 00232 { 00233 gint len = 0; 00234 guchar *char_ptr = NULL; 00235 00236 g_return_val_if_fail (a_in_start && a_in_end && a_len, 00237 CR_BAD_PARAM_ERROR); 00238 00239 for (char_ptr = (guchar *) a_in_start; 00240 char_ptr <= a_in_end; char_ptr++) { 00241 if (*char_ptr <= 0x7F) { 00242 /*the utf-8 char would take 1 byte */ 00243 len += 1; 00244 } else { 00245 /*the utf-8 char would take 2 bytes */ 00246 len += 2; 00247 } 00248 } 00249 00250 *a_len = len; 00251 return CR_OK; 00252 } 00253 00254 /** 00255 *Converts an utf8 buffer into an ucs4 buffer. 00256 * 00257 *@param a_in the input utf8 buffer to convert. 00258 *@param a_in_len in/out parameter. The size of the 00259 *input buffer to convert. After return, this parameter contains 00260 *the actual number of bytes consumed. 00261 *@param a_out the output converted ucs4 buffer. Must be allocated by 00262 *the caller. 00263 *@param a_out_len in/out parameter. The size of the output buffer. 00264 *If this size is actually smaller than the real needed size, the function 00265 *just converts what it can and returns a success status. After return, 00266 *this param points to the actual number of characters decoded. 00267 *@return CR_OK upon successfull completion, an error code otherwise. 00268 */ 00269 enum CRStatus 00270 cr_utils_utf8_to_ucs4 (const guchar * a_in, 00271 gulong * a_in_len, guint32 * a_out, gulong * a_out_len) 00272 { 00273 gulong in_len = 0, 00274 out_len = 0, 00275 in_index = 0, 00276 out_index = 0; 00277 enum CRStatus status = CR_OK; 00278 00279 /* 00280 *to store the final decoded 00281 *unicode char 00282 */ 00283 guint c = 0; 00284 00285 g_return_val_if_fail (a_in && a_in_len 00286 && a_out && a_out_len, CR_BAD_PARAM_ERROR); 00287 00288 if (*a_in_len < 1) { 00289 status = CR_OK; 00290 goto end; 00291 } 00292 00293 in_len = *a_in_len; 00294 out_len = *a_out_len; 00295 00296 for (in_index = 0, out_index = 0; 00297 (in_index < in_len) && (out_index < out_len); 00298 in_index++, out_index++) { 00299 gint nb_bytes_2_decode = 0; 00300 00301 if (a_in[in_index] <= 0x7F) { 00302 /* 00303 *7 bits long char 00304 *encoded over 1 byte: 00305 * 0xxx xxxx 00306 */ 00307 c = a_in[in_index]; 00308 nb_bytes_2_decode = 1; 00309 00310 } else if ((a_in[in_index] & 0xE0) == 0xC0) { 00311 /* 00312 *up to 11 bits long char. 00313 *encoded over 2 bytes: 00314 *110x xxxx 10xx xxxx 00315 */ 00316 c = a_in[in_index] & 0x1F; 00317 nb_bytes_2_decode = 2; 00318 00319 } else if ((a_in[in_index] & 0xF0) == 0xE0) { 00320 /* 00321 *up to 16 bit long char 00322 *encoded over 3 bytes: 00323 *1110 xxxx 10xx xxxx 10xx xxxx 00324 */ 00325 c = a_in[in_index] & 0x0F; 00326 nb_bytes_2_decode = 3; 00327 00328 } else if ((a_in[in_index] & 0xF8) == 0xF0) { 00329 /* 00330 *up to 21 bits long char 00331 *encoded over 4 bytes: 00332 *1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx 00333 */ 00334 c = a_in[in_index] & 0x7; 00335 nb_bytes_2_decode = 4; 00336 00337 } else if ((a_in[in_index] & 0xFC) == 0xF8) { 00338 /* 00339 *up to 26 bits long char 00340 *encoded over 5 bytes. 00341 *1111 10xx 10xx xxxx 10xx xxxx 00342 *10xx xxxx 10xx xxxx 00343 */ 00344 c = a_in[in_index] & 3; 00345 nb_bytes_2_decode = 5; 00346 00347 } else if ((a_in[in_index] & 0xFE) == 0xFC) { 00348 /* 00349 *up to 31 bits long char 00350 *encoded over 6 bytes: 00351 *1111 110x 10xx xxxx 10xx xxxx 00352 *10xx xxxx 10xx xxxx 10xx xxxx 00353 */ 00354 c = a_in[in_index] & 1; 00355 nb_bytes_2_decode = 6; 00356 00357 } else { 00358 /*BAD ENCODING */ 00359 goto end; 00360 } 00361 00362 /* 00363 *Go and decode the remaining byte(s) 00364 *(if any) to get the current character. 00365 */ 00366 for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) { 00367 /*decode the next byte */ 00368 in_index++; 00369 00370 /*byte pattern must be: 10xx xxxx */ 00371 if ((a_in[in_index] & 0xC0) != 0x80) { 00372 goto end; 00373 } 00374 00375 c = (c << 6) | (a_in[in_index] & 0x3F); 00376 } 00377 00378 /* 00379 *The decoded ucs4 char is now 00380 *in c. 00381 */ 00382 00383 /************************ 00384 *Some security tests 00385 ***********************/ 00386 00387 /*be sure c is a char */ 00388 if (c == 0xFFFF || c == 0xFFFE) 00389 goto end; 00390 00391 /*be sure c is inferior to the max ucs4 char value */ 00392 if (c > 0x10FFFF) 00393 goto end; 00394 00395 /* 00396 *c must be less than UTF16 "lower surrogate begin" 00397 *or higher than UTF16 "High surrogate end" 00398 */ 00399 if (c >= 0xD800 && c <= 0xDFFF) 00400 goto end; 00401 00402 /*Avoid characters that equals zero */ 00403 if (c == 0) 00404 goto end; 00405 00406 a_out[out_index] = c; 00407 } 00408 00409 end: 00410 *a_out_len = out_index + 1; 00411 *a_in_len = in_index + 1; 00412 00413 return status; 00414 } 00415 00416 /** 00417 *Reads a character from an utf8 buffer. 00418 *Actually decode the next character code (unicode character code) 00419 *and returns it. 00420 *@param a_in the starting address of the utf8 buffer. 00421 *@param a_in_len the length of the utf8 buffer. 00422 *@param a_out output parameter. The resulting read char. 00423 *@param a_consumed the number of the bytes consumed to 00424 *decode the returned character code. 00425 *@return CR_OK upon successfull completion, an error code otherwise. 00426 */ 00427 enum CRStatus 00428 cr_utils_read_char_from_utf8_buf (const guchar * a_in, 00429 gulong a_in_len, 00430 guint32 * a_out, gulong * a_consumed) 00431 { 00432 gulong in_index = 0, 00433 nb_bytes_2_decode = 0; 00434 enum CRStatus status = CR_OK; 00435 00436 /* 00437 *to store the final decoded 00438 *unicode char 00439 */ 00440 guint32 c = 0; 00441 00442 g_return_val_if_fail (a_in && a_out && a_out 00443 && a_consumed, CR_BAD_PARAM_ERROR); 00444 00445 if (a_in_len < 1) { 00446 status = CR_OK; 00447 goto end; 00448 } 00449 00450 if (*a_in <= 0x7F) { 00451 /* 00452 *7 bits long char 00453 *encoded over 1 byte: 00454 * 0xxx xxxx 00455 */ 00456 c = *a_in; 00457 nb_bytes_2_decode = 1; 00458 00459 } else if ((*a_in & 0xE0) == 0xC0) { 00460 /* 00461 *up to 11 bits long char. 00462 *encoded over 2 bytes: 00463 *110x xxxx 10xx xxxx 00464 */ 00465 c = *a_in & 0x1F; 00466 nb_bytes_2_decode = 2; 00467 00468 } else if ((*a_in & 0xF0) == 0xE0) { 00469 /* 00470 *up to 16 bit long char 00471 *encoded over 3 bytes: 00472 *1110 xxxx 10xx xxxx 10xx xxxx 00473 */ 00474 c = *a_in & 0x0F; 00475 nb_bytes_2_decode = 3; 00476 00477 } else if ((*a_in & 0xF8) == 0xF0) { 00478 /* 00479 *up to 21 bits long char 00480 *encoded over 4 bytes: 00481 *1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx 00482 */ 00483 c = *a_in & 0x7; 00484 nb_bytes_2_decode = 4; 00485 00486 } else if ((*a_in & 0xFC) == 0xF8) { 00487 /* 00488 *up to 26 bits long char 00489 *encoded over 5 bytes. 00490 *1111 10xx 10xx xxxx 10xx xxxx 00491 *10xx xxxx 10xx xxxx 00492 */ 00493 c = *a_in & 3; 00494 nb_bytes_2_decode = 5; 00495 00496 } else if ((*a_in & 0xFE) == 0xFC) { 00497 /* 00498 *up to 31 bits long char 00499 *encoded over 6 bytes: 00500 *1111 110x 10xx xxxx 10xx xxxx 00501 *10xx xxxx 10xx xxxx 10xx xxxx 00502 */ 00503 c = *a_in & 1; 00504 nb_bytes_2_decode = 6; 00505 00506 } else { 00507 /*BAD ENCODING */ 00508 goto end; 00509 } 00510 00511 if (nb_bytes_2_decode > a_in_len) { 00512 status = CR_END_OF_INPUT_ERROR; 00513 goto end; 00514 } 00515 00516 /* 00517 *Go and decode the remaining byte(s) 00518 *(if any) to get the current character. 00519 */ 00520 for (in_index = 1; in_index < nb_bytes_2_decode; in_index++) { 00521 /*byte pattern must be: 10xx xxxx */ 00522 if ((a_in[in_index] & 0xC0) != 0x80) { 00523 goto end; 00524 } 00525 00526 c = (c << 6) | (a_in[in_index] & 0x3F); 00527 } 00528 00529 /* 00530 *The decoded ucs4 char is now 00531 *in c. 00532 */ 00533 00534 /************************ 00535 *Some security tests 00536 ***********************/ 00537 00538 /*be sure c is a char */ 00539 if (c == 0xFFFF || c == 0xFFFE) 00540 goto end; 00541 00542 /*be sure c is inferior to the max ucs4 char value */ 00543 if (c > 0x10FFFF) 00544 goto end; 00545 00546 /* 00547 *c must be less than UTF16 "lower surrogate begin" 00548 *or higher than UTF16 "High surrogate end" 00549 */ 00550 if (c >= 0xD800 && c <= 0xDFFF) 00551 goto end; 00552 00553 /*Avoid characters that equals zero */ 00554 if (c == 0) 00555 goto end; 00556 00557 *a_out = c; 00558 00559 end: 00560 *a_consumed = nb_bytes_2_decode; 00561 00562 return status; 00563 } 00564 00565 /** 00566 * 00567 */ 00568 enum CRStatus 00569 cr_utils_utf8_str_len_as_ucs1 (const guchar * a_in_start, 00570 const guchar * a_in_end, gulong * a_len) 00571 { 00572 /* 00573 *Note: this function can be made shorter 00574 *but it considers all the cases of the utf8 encoding 00575 *to ease further extensions ... 00576 */ 00577 00578 guchar *byte_ptr = NULL; 00579 gint len = 0; 00580 00581 /* 00582 *to store the final decoded 00583 *unicode char 00584 */ 00585 guint c = 0; 00586 00587 g_return_val_if_fail (a_in_start && a_in_end && a_len, 00588 CR_BAD_PARAM_ERROR); 00589 *a_len = 0; 00590 00591 for (byte_ptr = (guchar *) a_in_start; 00592 byte_ptr <= a_in_end; byte_ptr++) { 00593 gint nb_bytes_2_decode = 0; 00594 00595 if (*byte_ptr <= 0x7F) { 00596 /* 00597 *7 bits long char 00598 *encoded over 1 byte: 00599 * 0xxx xxxx 00600 */ 00601 c = *byte_ptr; 00602 nb_bytes_2_decode = 1; 00603 00604 } else if ((*byte_ptr & 0xE0) == 0xC0) { 00605 /* 00606 *up to 11 bits long char. 00607 *encoded over 2 bytes: 00608 *110x xxxx 10xx xxxx 00609 */ 00610 c = *byte_ptr & 0x1F; 00611 nb_bytes_2_decode = 2; 00612 00613 } else if ((*byte_ptr & 0xF0) == 0xE0) { 00614 /* 00615 *up to 16 bit long char 00616 *encoded over 3 bytes: 00617 *1110 xxxx 10xx xxxx 10xx xxxx 00618 */ 00619 c = *byte_ptr & 0x0F; 00620 nb_bytes_2_decode = 3; 00621 00622 } else if ((*byte_ptr & 0xF8) == 0xF0) { 00623 /* 00624 *up to 21 bits long char 00625 *encoded over 4 bytes: 00626 *1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx 00627 */ 00628 c = *byte_ptr & 0x7; 00629 nb_bytes_2_decode = 4; 00630 00631 } else if ((*byte_ptr & 0xFC) == 0xF8) { 00632 /* 00633 *up to 26 bits long char 00634 *encoded over 5 bytes. 00635 *1111 10xx 10xx xxxx 10xx xxxx 00636 *10xx xxxx 10xx xxxx 00637 */ 00638 c = *byte_ptr & 3; 00639 nb_bytes_2_decode = 5; 00640 00641 } else if ((*byte_ptr & 0xFE) == 0xFC) { 00642 /* 00643 *up to 31 bits long char 00644 *encoded over 6 bytes: 00645 *1111 110x 10xx xxxx 10xx xxxx 00646 *10xx xxxx 10xx xxxx 10xx xxxx 00647 */ 00648 c = *byte_ptr & 1; 00649 nb_bytes_2_decode = 6; 00650 00651 } else { 00652 /* 00653 *BAD ENCODING 00654 */ 00655 return CR_ENCODING_ERROR; 00656 } 00657 00658 /* 00659 *Go and decode the remaining byte(s) 00660 *(if any) to get the current character. 00661 */ 00662 for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) { 00663 /*decode the next byte */ 00664 byte_ptr++; 00665 00666 /*byte pattern must be: 10xx xxxx */ 00667 if ((*byte_ptr & 0xC0) != 0x80) { 00668 return CR_ENCODING_ERROR; 00669 } 00670 00671 c = (c << 6) | (*byte_ptr & 0x3F); 00672 } 00673 00674 /* 00675 *The decoded ucs4 char is now 00676 *in c. 00677 */ 00678 00679 if (c <= 0xFF) { /*Add other conditions to support 00680 *other char sets (ucs2, ucs3, ucs4). 00681 */ 00682 len++; 00683 } else { 00684 /*the char is too long to fit 00685 *into the supposed charset len. 00686 */ 00687 return CR_ENCODING_ERROR; 00688 } 00689 } 00690 00691 *a_len = len; 00692 00693 return CR_OK; 00694 } 00695 00696 /** 00697 *Converts an utf8 string into an ucs4 string. 00698 *@param a_in the input string to convert. 00699 *@param a_in_len in/out parameter. The length of the input 00700 *string. After return, points to the actual number of bytes 00701 *consumed. This can be usefull to debug the input stream in case 00702 *of encoding error. 00703 *@param a_out out parameter. Points to the output string. It is allocated 00704 *by this function and must be freed by the caller. 00705 *@param a_out_len out parameter. The length of the output string. 00706 *@return CR_OK upon successfull completion, an error code otherwise. 00707 * 00708 */ 00709 enum CRStatus 00710 cr_utils_utf8_str_to_ucs4 (const guchar * a_in, 00711 gulong * a_in_len, 00712 guint32 ** a_out, gulong * a_out_len) 00713 { 00714 enum CRStatus status = CR_OK; 00715 00716 g_return_val_if_fail (a_in && a_in_len 00717 && a_out && a_out_len, CR_BAD_PARAM_ERROR); 00718 00719 status = cr_utils_utf8_str_len_as_ucs4 (a_in, 00720 &a_in[*a_in_len - 1], 00721 a_out_len); 00722 00723 g_return_val_if_fail (status == CR_OK, status); 00724 00725 *a_out = g_malloc0 (*a_out_len * sizeof (guint32)); 00726 00727 status = cr_utils_utf8_to_ucs4 (a_in, a_in_len, *a_out, a_out_len); 00728 00729 return status; 00730 } 00731 00732 /** 00733 *Converts an ucs4 buffer into an utf8 buffer. 00734 * 00735 *@param a_in the input ucs4 buffer to convert. 00736 *@param a_in_len in/out parameter. The size of the 00737 *input buffer to convert. After return, this parameter contains 00738 *the actual number of characters consumed. 00739 *@param a_out the output converted utf8 buffer. Must be allocated by 00740 *the caller. 00741 *@param a_out_len in/out parameter. The size of the output buffer. 00742 *If this size is actually smaller than the real needed size, the function 00743 *just converts what it can and returns a success status. After return, 00744 *this param points to the actual number of bytes in the buffer. 00745 *@return CR_OK upon successfull completion, an error code otherwise. 00746 */ 00747 enum CRStatus 00748 cr_utils_ucs4_to_utf8 (const guint32 * a_in, 00749 gulong * a_in_len, guchar * a_out, gulong * a_out_len) 00750 { 00751 gulong in_len = 0, 00752 in_index = 0, 00753 out_index = 0; 00754 enum CRStatus status = CR_OK; 00755 00756 g_return_val_if_fail (a_in && a_in_len && a_out && a_out_len, 00757 CR_BAD_PARAM_ERROR); 00758 00759 if (*a_in_len < 1) { 00760 status = CR_OK; 00761 goto end; 00762 } 00763 00764 in_len = *a_in_len; 00765 00766 for (in_index = 0; in_index < in_len; in_index++) { 00767 /* 00768 *FIXME: return whenever we encounter forbidden char values. 00769 */ 00770 00771 if (a_in[in_index] <= 0x7F) { 00772 a_out[out_index] = a_in[in_index]; 00773 out_index++; 00774 } else if (a_in[in_index] <= 0x7FF) { 00775 a_out[out_index] = (0xC0 | (a_in[in_index] >> 6)); 00776 a_out[out_index + 1] = 00777 (0x80 | (a_in[in_index] & 0x3F)); 00778 out_index += 2; 00779 } else if (a_in[in_index] <= 0xFFFF) { 00780 a_out[out_index] = (0xE0 | (a_in[in_index] >> 12)); 00781 a_out[out_index + 1] = 00782 (0x80 | ((a_in[in_index] >> 6) & 0x3F)); 00783 a_out[out_index + 2] = 00784 (0x80 | (a_in[in_index] & 0x3F)); 00785 out_index += 3; 00786 } else if (a_in[in_index] <= 0x1FFFFF) { 00787 a_out[out_index] = (0xF0 | (a_in[in_index] >> 18)); 00788 a_out[out_index + 1] 00789 = (0x80 | ((a_in[in_index] >> 12) & 0x3F)); 00790 a_out[out_index + 2] 00791 = (0x80 | ((a_in[in_index] >> 6) & 0x3F)); 00792 a_out[out_index + 3] 00793 = (0x80 | (a_in[in_index] & 0x3F)); 00794 out_index += 4; 00795 } else if (a_in[in_index] <= 0x3FFFFFF) { 00796 a_out[out_index] = (0xF8 | (a_in[in_index] >> 24)); 00797 a_out[out_index + 1] = 00798 (0x80 | (a_in[in_index] >> 18)); 00799 a_out[out_index + 2] 00800 = (0x80 | ((a_in[in_index] >> 12) & 0x3F)); 00801 a_out[out_index + 3] 00802 = (0x80 | ((a_in[in_index] >> 6) & 0x3F)); 00803 a_out[out_index + 4] 00804 = (0x80 | (a_in[in_index] & 0x3F)); 00805 out_index += 5; 00806 } else if (a_in[in_index] <= 0x7FFFFFFF) { 00807 a_out[out_index] = (0xFC | (a_in[in_index] >> 30)); 00808 a_out[out_index + 1] = 00809 (0x80 | (a_in[in_index] >> 24)); 00810 a_out[out_index + 2] 00811 = (0x80 | ((a_in[in_index] >> 18) & 0x3F)); 00812 a_out[out_index + 3] 00813 = (0x80 | ((a_in[in_index] >> 12) & 0x3F)); 00814 a_out[out_index + 4] 00815 = (0x80 | ((a_in[in_index] >> 6) & 0x3F)); 00816 a_out[out_index + 4] 00817 = (0x80 | (a_in[in_index] & 0x3F)); 00818 out_index += 6; 00819 } else { 00820 status = CR_ENCODING_ERROR; 00821 goto end; 00822 } 00823 } /*end for */ 00824 00825 end: 00826 *a_in_len = in_index + 1; 00827 *a_out_len = out_index + 1; 00828 00829 return status; 00830 } 00831 00832 /** 00833 *Converts an ucs4 string into an utf8 string. 00834 *@param a_in the input string to convert. 00835 *@param a_in_len in/out parameter. The length of the input 00836 *string. After return, points to the actual number of characters 00837 *consumed. This can be usefull to debug the input string in case 00838 *of encoding error. 00839 *@param a_out out parameter. Points to the output string. It is allocated 00840 *by this function and must be freed by the caller. 00841 *@param a_out_len out parameter. The length (in bytes) of the output string. 00842 *@return CR_OK upon successfull completion, an error code otherwise. 00843 */ 00844 enum CRStatus 00845 cr_utils_ucs4_str_to_utf8 (const guint32 * a_in, 00846 gulong * a_in_len, 00847 guchar ** a_out, gulong * a_out_len) 00848 { 00849 enum CRStatus status = CR_OK; 00850 00851 g_return_val_if_fail (a_in && a_in_len && a_out 00852 && a_out_len, CR_BAD_PARAM_ERROR); 00853 00854 status = cr_utils_ucs4_str_len_as_utf8 (a_in, 00855 &a_in[*a_out_len - 1], 00856 a_out_len); 00857 00858 g_return_val_if_fail (status == CR_OK, status); 00859 00860 status = cr_utils_ucs4_to_utf8 (a_in, a_in_len, *a_out, a_out_len); 00861 00862 return status; 00863 } 00864 00865 /** 00866 *Converts an ucs1 buffer into an utf8 buffer. 00867 *The caller must know the size of the resulting buffer and 00868 *allocate it prior to calling this function. 00869 * 00870 *@param a_in the input ucs1 buffer. 00871 * 00872 *@param a_in_len in/out parameter. The length of the input buffer. 00873 *After return, points to the number of bytes actually consumed even 00874 *in case of encoding error. 00875 * 00876 *@param a_out out parameter. The output utf8 converted buffer. 00877 * 00878 *@param a_out_len in/out parameter. The size of the output buffer. 00879 *If the output buffer size is shorter than the actual needed size, 00880 *this function just convert what it can. 00881 * 00882 *@return CR_OK upon successfull completion, an error code otherwise. 00883 * 00884 */ 00885 enum CRStatus 00886 cr_utils_ucs1_to_utf8 (const guchar * a_in, 00887 gulong * a_in_len, guchar * a_out, gulong * a_out_len) 00888 { 00889 gulong out_index = 0, 00890 in_index = 0, 00891 in_len = 0, 00892 out_len = 0; 00893 enum CRStatus status = CR_OK; 00894 00895 g_return_val_if_fail (a_in && a_in_len 00896 && a_out_len, 00897 CR_BAD_PARAM_ERROR); 00898 00899 if (*a_in_len == 0) { 00900 *a_out_len = 0 ; 00901 return status; 00902 } 00903 g_return_val_if_fail (a_out, CR_BAD_PARAM_ERROR) ; 00904 00905 in_len = *a_in_len; 00906 out_len = *a_out_len; 00907 00908 for (in_index = 0, out_index = 0; 00909 (in_index < in_len) && (out_index < out_len); in_index++) { 00910 /* 00911 *FIXME: return whenever we encounter forbidden char values. 00912 */ 00913 00914 if (a_in[in_index] <= 0x7F) { 00915 a_out[out_index] = a_in[in_index]; 00916 out_index++; 00917 } else { 00918 a_out[out_index] = (0xC0 | (a_in[in_index] >> 6)); 00919 a_out[out_index + 1] = 00920 (0x80 | (a_in[in_index] & 0x3F)); 00921 out_index += 2; 00922 } 00923 } /*end for */ 00924 00925 *a_in_len = in_index; 00926 *a_out_len = out_index; 00927 00928 return status; 00929 } 00930 00931 /** 00932 *Converts an ucs1 string into an utf8 string. 00933 *@param a_in_start the beginning of the input string to convert. 00934 *@param a_in_end the end of the input string to convert. 00935 *@param a_out out parameter. The converted string. 00936 *@param a_out out parameter. The length of the converted string. 00937 *@return CR_OK upon successfull completion, an error code otherwise. 00938 * 00939 */ 00940 enum CRStatus 00941 cr_utils_ucs1_str_to_utf8 (const guchar * a_in, 00942 gulong * a_in_len, 00943 guchar ** a_out, gulong * a_out_len) 00944 { 00945 gulong out_len = 0; 00946 enum CRStatus status = CR_OK; 00947 00948 g_return_val_if_fail (a_in && a_in_len && a_out 00949 && a_out_len, CR_BAD_PARAM_ERROR); 00950 00951 if (*a_in_len < 1) { 00952 *a_out_len = 0; 00953 *a_out = NULL; 00954 return CR_OK; 00955 } 00956 00957 status = cr_utils_ucs1_str_len_as_utf8 (a_in, &a_in[*a_in_len - 1], 00958 &out_len); 00959 00960 g_return_val_if_fail (status == CR_OK, status); 00961 00962 *a_out = g_malloc0 (out_len); 00963 00964 status = cr_utils_ucs1_to_utf8 (a_in, a_in_len, *a_out, &out_len); 00965 00966 *a_out_len = out_len; 00967 00968 return status; 00969 } 00970 00971 /** 00972 *Converts an utf8 buffer into an ucs1 buffer. 00973 *The caller must know the size of the resulting 00974 *converted buffer, and allocated it prior to calling this 00975 *function. 00976 * 00977 *@param a_in the input utf8 buffer to convert. 00978 * 00979 *@param a_in_len in/out parameter. The size of the input utf8 buffer. 00980 *After return, points to the number of bytes consumed 00981 *by the function even in case of encoding error. 00982 * 00983 *@param a_out out parameter. Points to the resulting buffer. 00984 *Must be allocated by the caller. If the size of a_out is shorter 00985 *than its required size, this function converts what it can and return 00986 *a successfull status. 00987 * 00988 *@param a_out_len in/out parameter. The size of the output buffer. 00989 *After return, points to the number of bytes consumed even in case of 00990 *encoding error. 00991 * 00992 *@return CR_OK upon successfull completion, an error code otherwise. 00993 */ 00994 enum CRStatus 00995 cr_utils_utf8_to_ucs1 (const guchar * a_in, 00996 gulong * a_in_len, guchar * a_out, gulong * a_out_len) 00997 { 00998 gulong in_index = 0, 00999 out_index = 0, 01000 in_len = 0, 01001 out_len = 0; 01002 enum CRStatus status = CR_OK; 01003 01004 /* 01005 *to store the final decoded 01006 *unicode char 01007 */ 01008 guint32 c = 0; 01009 01010 g_return_val_if_fail (a_in && a_in_len 01011 && a_out && a_out_len, CR_BAD_PARAM_ERROR); 01012 01013 if (*a_in_len < 1) { 01014 goto end; 01015 } 01016 01017 in_len = *a_in_len; 01018 out_len = *a_out_len; 01019 01020 for (in_index = 0, out_index = 0; 01021 (in_index < in_len) && (out_index < out_len); 01022 in_index++, out_index++) { 01023 gint nb_bytes_2_decode = 0; 01024 01025 if (a_in[in_index] <= 0x7F) { 01026 /* 01027 *7 bits long char 01028 *encoded over 1 byte: 01029 * 0xxx xxxx 01030 */ 01031 c = a_in[in_index]; 01032 nb_bytes_2_decode = 1; 01033 01034 } else if ((a_in[in_index] & 0xE0) == 0xC0) { 01035 /* 01036 *up to 11 bits long char. 01037 *encoded over 2 bytes: 01038 *110x xxxx 10xx xxxx 01039 */ 01040 c = a_in[in_index] & 0x1F; 01041 nb_bytes_2_decode = 2; 01042 01043 } else if ((a_in[in_index] & 0xF0) == 0xE0) { 01044 /* 01045 *up to 16 bit long char 01046 *encoded over 3 bytes: 01047 *1110 xxxx 10xx xxxx 10xx xxxx 01048 */ 01049 c = a_in[in_index] & 0x0F; 01050 nb_bytes_2_decode = 3; 01051 01052 } else if ((a_in[in_index] & 0xF8) == 0xF0) { 01053 /* 01054 *up to 21 bits long char 01055 *encoded over 4 bytes: 01056 *1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx 01057 */ 01058 c = a_in[in_index] & 0x7; 01059 nb_bytes_2_decode = 4; 01060 01061 } else if ((a_in[in_index] & 0xFC) == 0xF8) { 01062 /* 01063 *up to 26 bits long char 01064 *encoded over 5 bytes. 01065 *1111 10xx 10xx xxxx 10xx xxxx 01066 *10xx xxxx 10xx xxxx 01067 */ 01068 c = a_in[in_index] & 3; 01069 nb_bytes_2_decode = 5; 01070 01071 } else if ((a_in[in_index] & 0xFE) == 0xFC) { 01072 /* 01073 *up to 31 bits long char 01074 *encoded over 6 bytes: 01075 *1111 110x 10xx xxxx 10xx xxxx 01076 *10xx xxxx 10xx xxxx 10xx xxxx 01077 */ 01078 c = a_in[in_index] & 1; 01079 nb_bytes_2_decode = 6; 01080 01081 } else { 01082 /*BAD ENCODING */ 01083 status = CR_ENCODING_ERROR; 01084 goto end; 01085 } 01086 01087 /* 01088 *Go and decode the remaining byte(s) 01089 *(if any) to get the current character. 01090 */ 01091 if (in_index + nb_bytes_2_decode - 1 >= in_len) { 01092 goto end; 01093 } 01094 01095 for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) { 01096 /*decode the next byte */ 01097 in_index++; 01098 01099 /*byte pattern must be: 10xx xxxx */ 01100 if ((a_in[in_index] & 0xC0) != 0x80) { 01101 status = CR_ENCODING_ERROR; 01102 goto end; 01103 } 01104 01105 c = (c << 6) | (a_in[in_index] & 0x3F); 01106 } 01107 01108 /* 01109 *The decoded ucs4 char is now 01110 *in c. 01111 */ 01112 01113 if (c > 0xFF) { 01114 status = CR_ENCODING_ERROR; 01115 goto end; 01116 } 01117 01118 a_out[out_index] = c; 01119 } 01120 01121 end: 01122 *a_out_len = out_index; 01123 *a_in_len = in_index; 01124 01125 return status; 01126 } 01127 01128 /** 01129 *Converts an utf8 buffer into an 01130 *ucs1 buffer. 01131 *@param a_in_start the start of the input buffer. 01132 *@param a_in_end the end of the input buffer. 01133 *@param a_out out parameter. The resulting converted ucs4 buffer. 01134 *Must be freed by the caller. 01135 *@param a_out_len out parameter. The length of the converted buffer. 01136 *@return CR_OK upon successfull completion, an error code otherwise. 01137 *Note that out parameters are valid if and only if this function 01138 *returns CR_OK. 01139 */ 01140 enum CRStatus 01141 cr_utils_utf8_str_to_ucs1 (const guchar * a_in, 01142 gulong * a_in_len, 01143 guchar ** a_out, gulong * a_out_len) 01144 { 01145 enum CRStatus status = CR_OK; 01146 01147 g_return_val_if_fail (a_in && a_in_len 01148 && a_out && a_out_len, CR_BAD_PARAM_ERROR); 01149 01150 if (*a_in_len < 1) { 01151 *a_out_len = 0; 01152 *a_out = NULL; 01153 return CR_OK; 01154 } 01155 01156 status = cr_utils_utf8_str_len_as_ucs4 (a_in, &a_in[*a_in_len - 1], 01157 a_out_len); 01158 01159 g_return_val_if_fail (status == CR_OK, status); 01160 01161 *a_out = g_malloc0 (*a_out_len * sizeof (guint32)); 01162 01163 status = cr_utils_utf8_to_ucs1 (a_in, a_in_len, *a_out, a_out_len); 01164 return status; 01165 } 01166 01167 /***************************************** 01168 *CSS basic types identification utilities 01169 *****************************************/ 01170 01171 /** 01172 *Returns TRUE if a_char is a white space as 01173 *defined in the css spec in chap 4.1.1. 01174 * 01175 *white-space ::= ' '| \t|\r|\n|\f 01176 * 01177 *@param a_char the character to test. 01178 *return TRUE if is a white space, false otherwise. 01179 */ 01180 gboolean 01181 cr_utils_is_white_space (guint32 a_char) 01182 { 01183 switch (a_char) { 01184 case ' ': 01185 case '\t': 01186 case '\r': 01187 case '\n': 01188 case '\f': 01189 return TRUE; 01190 break; 01191 default: 01192 return FALSE; 01193 } 01194 } 01195 01196 /** 01197 *Returns true if the character is a newline 01198 *as defined in the css spec in the chap 4.1.1. 01199 * 01200 *nl ::= \n|\r\n|\r|\f 01201 * 01202 *@param a_char the character to test. 01203 *@return TRUE if the character is a newline, FALSE otherwise. 01204 */ 01205 gboolean 01206 cr_utils_is_newline (guint32 a_char) 01207 { 01208 switch (a_char) { 01209 case '\n': 01210 case '\r': 01211 case '\f': 01212 return TRUE; 01213 break; 01214 default: 01215 return FALSE; 01216 } 01217 } 01218 01219 /** 01220 *returns TRUE if the char is part of an hexa num char: 01221 *i.e hexa_char ::= [0-9A-F] 01222 */ 01223 gboolean 01224 cr_utils_is_hexa_char (guint32 a_char) 01225 { 01226 if ((a_char >= '0' && a_char <= '9') 01227 || (a_char >= 'A' && a_char <= 'F')) { 01228 return TRUE; 01229 } 01230 return FALSE; 01231 } 01232 01233 /** 01234 *Returns true if the character is a nonascii 01235 *character (as defined in the css spec chap 4.1.1): 01236 * 01237 *nonascii ::= [^\0-\177] 01238 * 01239 *@param a_char the character to test. 01240 *@return TRUE if the character is a nonascii char, 01241 *FALSE otherwise. 01242 */ 01243 gboolean 01244 cr_utils_is_nonascii (guint32 a_char) 01245 { 01246 if (a_char <= 177) { 01247 return FALSE; 01248 } 01249 01250 return TRUE; 01251 } 01252 01253 /** 01254 *Dumps a character a_nb times on a file. 01255 *@param a_char the char to dump 01256 *@param a_fp the destination file pointer 01257 *@param a_nb the number of times a_char is to be dumped. 01258 */ 01259 void 01260 cr_utils_dump_n_chars (guchar a_char, FILE * a_fp, glong a_nb) 01261 { 01262 glong i = 0; 01263 01264 for (i = 0; i < a_nb; i++) { 01265 fprintf (a_fp, "%c", a_char); 01266 } 01267 } 01268 01269 void 01270 cr_utils_dump_n_chars2 (guchar a_char, GString * a_string, glong a_nb) 01271 { 01272 glong i = 0; 01273 01274 g_return_if_fail (a_string); 01275 01276 for (i = 0; i < a_nb; i++) { 01277 g_string_append_printf (a_string, "%c", a_char); 01278 } 01279 } 01280 01281 /** 01282 *Duplicates a list of GString instances. 01283 *@return the duplicated list of GString instances or NULL if 01284 *something bad happened. 01285 *@param a_list_of_strings the list of strings to be duplicated. 01286 */ 01287 GList * 01288 cr_utils_dup_glist_of_string (GList const * a_list_of_strings) 01289 { 01290 GList const *cur = NULL; 01291 GList *result = NULL; 01292 01293 g_return_val_if_fail (a_list_of_strings, NULL); 01294 01295 for (cur = a_list_of_strings; cur; cur = cur->next) { 01296 GString *str = NULL; 01297 01298 str = g_string_new_len (((GString *) cur->data)->str, 01299 ((GString *) cur->data)->len); 01300 if (str) 01301 result = g_list_append (result, str); 01302 } 01303 01304 return result; 01305 } 01306 01307 /** 01308 *Duplicate a GList where the GList::data is a CRString. 01309 *@param a_list_of_strings the list to duplicate 01310 *@return the duplicated list, or NULL if something bad 01311 *happened. 01312 */ 01313 GList * 01314 cr_utils_dup_glist_of_cr_string (GList const * a_list_of_strings) 01315 { 01316 GList const *cur = NULL; 01317 GList *result = NULL; 01318 01319 g_return_val_if_fail (a_list_of_strings, NULL); 01320 01321 for (cur = a_list_of_strings; cur; cur = cur->next) { 01322 CRString *str = NULL; 01323 01324 str = cr_string_dup ((CRString const *) cur->data) ; 01325 if (str) 01326 result = g_list_append (result, str); 01327 } 01328 01329 return result; 01330 }