Libcroco
cr-utils.c
Go to the documentation of this file.
00001 /* -*- Mode: C; indent-tabs-mode: nil; c-basic-offset: 8 -*- */
00002 
00003 /*
00004  * This file is part of The Croco Library
00005  *
00006  * This program is free software; you can redistribute it and/or
00007  * modify it under the terms of version 2.1 of the GNU Lesser General Public
00008  * License as published by the Free Software Foundation.
00009  *
00010  * This program is distributed in the hope that it will be useful,
00011  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00012  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00013  * GNU General Public License for more details.
00014  *
00015  * You should have received a copy of the GNU Lesser General Public License
00016  * along with this program; if not, write to the Free Software
00017  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
00018  * USA
00019  *
00020  * Author: Dodji Seketeli
00021  * See COPYRIGHTS file for copyright information.
00022  */
00023 
00024 #include "cr-utils.h"
00025 #include "cr-string.h"
00026 
00027 /**
00028  *@file:
00029  *Some misc utility functions used
00030  *in the libcroco.
00031  *Note that throughout this file I will
00032  *refer to the CSS SPECIFICATIONS DOCUMENTATION
00033  *written by the w3c guys. You can find that document
00034  *at http://www.w3.org/TR/REC-CSS2/ .
00035  */
00036 
00037 /****************************
00038  *Encoding transformations and
00039  *encoding helpers
00040  ****************************/
00041 
00042 /*
00043  *Here is the correspondence between the ucs-4 character codes
00044  *and their matching utf-8 encoding pattern as described by RFC 2279:
00045  *
00046  *UCS-4 range (hex.)    UTF-8 octet sequence (binary)
00047  *------------------    -----------------------------
00048  *0000 0000-0000 007F   0xxxxxxx
00049  *0000 0080-0000 07FF   110xxxxx 10xxxxxx
00050  *0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
00051  *0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
00052  *0020 0000-03FF FFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
00053  *0400 0000-7FFF FFFF   1111110x 10xxxxxx ... 10xxxxxx
00054  */
00055 
00056 /**
00057  *Given an utf8 string buffer, calculates
00058  *the length of this string if it was encoded
00059  *in ucs4.
00060  *@param a_in_start a pointer to the begining of
00061  *the input utf8 string.
00062  *@param a_in_end a pointre to the end of the input
00063  *utf8 string (points to the last byte of the buffer)
00064  *@param a_len out parameter the calculated length.
00065  *@return CR_OK upon succesfull completion, an error code
00066  *otherwise.
00067  */
00068 enum CRStatus
00069 cr_utils_utf8_str_len_as_ucs4 (const guchar * a_in_start,
00070                                const guchar * a_in_end, gulong * a_len)
00071 {
00072         guchar *byte_ptr = NULL;
00073         gint len = 0;
00074 
00075         /*
00076          *to store the final decoded 
00077          *unicode char
00078          */
00079         guint c = 0;
00080 
00081         g_return_val_if_fail (a_in_start && a_in_end && a_len,
00082                               CR_BAD_PARAM_ERROR);
00083         *a_len = 0;
00084 
00085         for (byte_ptr = (guchar *) a_in_start;
00086              byte_ptr <= a_in_end; byte_ptr++) {
00087                 gint nb_bytes_2_decode = 0;
00088 
00089                 if (*byte_ptr <= 0x7F) {
00090                         /*
00091                          *7 bits long char
00092                          *encoded over 1 byte:
00093                          * 0xxx xxxx
00094                          */
00095                         c = *byte_ptr;
00096                         nb_bytes_2_decode = 1;
00097 
00098                 } else if ((*byte_ptr & 0xE0) == 0xC0) {
00099                         /*
00100                          *up to 11 bits long char.
00101                          *encoded over 2 bytes:
00102                          *110x xxxx  10xx xxxx
00103                          */
00104                         c = *byte_ptr & 0x1F;
00105                         nb_bytes_2_decode = 2;
00106 
00107                 } else if ((*byte_ptr & 0xF0) == 0xE0) {
00108                         /*
00109                          *up to 16 bit long char
00110                          *encoded over 3 bytes:
00111                          *1110 xxxx  10xx xxxx  10xx xxxx
00112                          */
00113                         c = *byte_ptr & 0x0F;
00114                         nb_bytes_2_decode = 3;
00115 
00116                 } else if ((*byte_ptr & 0xF8) == 0xF0) {
00117                         /*
00118                          *up to 21 bits long char
00119                          *encoded over 4 bytes:
00120                          *1111 0xxx  10xx xxxx  10xx xxxx  10xx xxxx
00121                          */
00122                         c = *byte_ptr & 0x7;
00123                         nb_bytes_2_decode = 4;
00124 
00125                 } else if ((*byte_ptr & 0xFC) == 0xF8) {
00126                         /*
00127                          *up to 26 bits long char
00128                          *encoded over 5 bytes.
00129                          *1111 10xx  10xx xxxx  10xx xxxx  
00130                          *10xx xxxx  10xx xxxx
00131                          */
00132                         c = *byte_ptr & 3;
00133                         nb_bytes_2_decode = 5;
00134 
00135                 } else if ((*byte_ptr & 0xFE) == 0xFC) {
00136                         /*
00137                          *up to 31 bits long char
00138                          *encoded over 6 bytes:
00139                          *1111 110x  10xx xxxx  10xx xxxx  
00140                          *10xx xxxx  10xx xxxx  10xx xxxx
00141                          */
00142                         c = *byte_ptr & 1;
00143                         nb_bytes_2_decode = 6;
00144 
00145                 } else {
00146                         /*
00147                          *BAD ENCODING
00148                          */
00149                         return CR_ENCODING_ERROR;
00150                 }
00151 
00152                 /*
00153                  *Go and decode the remaining byte(s)
00154                  *(if any) to get the current character.
00155                  */
00156                 for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) {
00157                         /*decode the next byte */
00158                         byte_ptr++;
00159 
00160                         /*byte pattern must be: 10xx xxxx */
00161                         if ((*byte_ptr & 0xC0) != 0x80) {
00162                                 return CR_ENCODING_ERROR;
00163                         }
00164 
00165                         c = (c << 6) | (*byte_ptr & 0x3F);
00166                 }
00167 
00168                 len++;
00169         }
00170 
00171         *a_len = len;
00172 
00173         return CR_OK;
00174 }
00175 
00176 /**
00177  *Given an ucs4 string, this function
00178  *returns the size (in bytes) this string
00179  *would have occupied if it was encoded in utf-8.
00180  *@param a_in_start a pointer to the beginning of the input
00181  *buffer.
00182  *@param a_in_end a pointer to the end of the input buffer.
00183  *@param a_len out parameter. The computed length.
00184  *@return CR_OK upon successfull completion, an error code otherwise.
00185  */
00186 enum CRStatus
00187 cr_utils_ucs4_str_len_as_utf8 (const guint32 * a_in_start,
00188                                const guint32 * a_in_end, gulong * a_len)
00189 {
00190         gint len = 0;
00191         guint32 *char_ptr = NULL;
00192 
00193         g_return_val_if_fail (a_in_start && a_in_end && a_len,
00194                               CR_BAD_PARAM_ERROR);
00195 
00196         for (char_ptr = (guint32 *) a_in_start;
00197              char_ptr <= a_in_end; char_ptr++) {
00198                 if (*char_ptr <= 0x7F) {
00199                         /*the utf-8 char would take 1 byte */
00200                         len += 1;
00201                 } else if (*char_ptr <= 0x7FF) {
00202                         /*the utf-8 char would take 2 bytes */
00203                         len += 2;
00204                 } else if (*char_ptr <= 0xFFFF) {
00205                         len += 3;
00206                 } else if (*char_ptr <= 0x1FFFFF) {
00207                         len += 4;
00208                 } else if (*char_ptr <= 0x3FFFFFF) {
00209                         len += 5;
00210                 } else if (*char_ptr <= 0x7FFFFFFF) {
00211                         len += 6;
00212                 }
00213         }
00214 
00215         *a_len = len;
00216         return CR_OK;
00217 }
00218 
00219 /**
00220  *Given an ucsA string, this function
00221  *returns the size (in bytes) this string
00222  *would have occupied if it was encoded in utf-8.
00223  *@param a_in_start a pointer to the beginning of the input
00224  *buffer.
00225  *@param a_in_end a pointer to the end of the input buffer.
00226  *@param a_len out parameter. The computed length.
00227  *@return CR_OK upon successfull completion, an error code otherwise.
00228  */
00229 enum CRStatus
00230 cr_utils_ucs1_str_len_as_utf8 (const guchar * a_in_start,
00231                                const guchar * a_in_end, gulong * a_len)
00232 {
00233         gint len = 0;
00234         guchar *char_ptr = NULL;
00235 
00236         g_return_val_if_fail (a_in_start && a_in_end && a_len,
00237                               CR_BAD_PARAM_ERROR);
00238 
00239         for (char_ptr = (guchar *) a_in_start;
00240              char_ptr <= a_in_end; char_ptr++) {
00241                 if (*char_ptr <= 0x7F) {
00242                         /*the utf-8 char would take 1 byte */
00243                         len += 1;
00244                 } else {
00245                         /*the utf-8 char would take 2 bytes */
00246                         len += 2;
00247                 }
00248         }
00249 
00250         *a_len = len;
00251         return CR_OK;
00252 }
00253 
00254 /**
00255  *Converts an utf8 buffer into an ucs4 buffer.
00256  *
00257  *@param a_in the input utf8 buffer to convert.
00258  *@param a_in_len in/out parameter. The size of the
00259  *input buffer to convert. After return, this parameter contains
00260  *the actual number of bytes consumed.
00261  *@param a_out the output converted ucs4 buffer. Must be allocated by
00262  *the caller.
00263  *@param a_out_len in/out parameter. The size of the output buffer.
00264  *If this size is actually smaller than the real needed size, the function
00265  *just converts what it can and returns a success status. After return,
00266  *this param points to the actual number of characters decoded.
00267  *@return CR_OK upon successfull completion, an error code otherwise.
00268  */
00269 enum CRStatus
00270 cr_utils_utf8_to_ucs4 (const guchar * a_in,
00271                        gulong * a_in_len, guint32 * a_out, gulong * a_out_len)
00272 {
00273         gulong in_len = 0,
00274                 out_len = 0,
00275                 in_index = 0,
00276                 out_index = 0;
00277         enum CRStatus status = CR_OK;
00278 
00279         /*
00280          *to store the final decoded 
00281          *unicode char
00282          */
00283         guint c = 0;
00284 
00285         g_return_val_if_fail (a_in && a_in_len
00286                               && a_out && a_out_len, CR_BAD_PARAM_ERROR);
00287 
00288         if (*a_in_len < 1) {
00289                 status = CR_OK;
00290                 goto end;
00291         }
00292 
00293         in_len = *a_in_len;
00294         out_len = *a_out_len;
00295 
00296         for (in_index = 0, out_index = 0;
00297              (in_index < in_len) && (out_index < out_len);
00298              in_index++, out_index++) {
00299                 gint nb_bytes_2_decode = 0;
00300 
00301                 if (a_in[in_index] <= 0x7F) {
00302                         /*
00303                          *7 bits long char
00304                          *encoded over 1 byte:
00305                          * 0xxx xxxx
00306                          */
00307                         c = a_in[in_index];
00308                         nb_bytes_2_decode = 1;
00309 
00310                 } else if ((a_in[in_index] & 0xE0) == 0xC0) {
00311                         /*
00312                          *up to 11 bits long char.
00313                          *encoded over 2 bytes:
00314                          *110x xxxx  10xx xxxx
00315                          */
00316                         c = a_in[in_index] & 0x1F;
00317                         nb_bytes_2_decode = 2;
00318 
00319                 } else if ((a_in[in_index] & 0xF0) == 0xE0) {
00320                         /*
00321                          *up to 16 bit long char
00322                          *encoded over 3 bytes:
00323                          *1110 xxxx  10xx xxxx  10xx xxxx
00324                          */
00325                         c = a_in[in_index] & 0x0F;
00326                         nb_bytes_2_decode = 3;
00327 
00328                 } else if ((a_in[in_index] & 0xF8) == 0xF0) {
00329                         /*
00330                          *up to 21 bits long char
00331                          *encoded over 4 bytes:
00332                          *1111 0xxx  10xx xxxx  10xx xxxx  10xx xxxx
00333                          */
00334                         c = a_in[in_index] & 0x7;
00335                         nb_bytes_2_decode = 4;
00336 
00337                 } else if ((a_in[in_index] & 0xFC) == 0xF8) {
00338                         /*
00339                          *up to 26 bits long char
00340                          *encoded over 5 bytes.
00341                          *1111 10xx  10xx xxxx  10xx xxxx  
00342                          *10xx xxxx  10xx xxxx
00343                          */
00344                         c = a_in[in_index] & 3;
00345                         nb_bytes_2_decode = 5;
00346 
00347                 } else if ((a_in[in_index] & 0xFE) == 0xFC) {
00348                         /*
00349                          *up to 31 bits long char
00350                          *encoded over 6 bytes:
00351                          *1111 110x  10xx xxxx  10xx xxxx  
00352                          *10xx xxxx  10xx xxxx  10xx xxxx
00353                          */
00354                         c = a_in[in_index] & 1;
00355                         nb_bytes_2_decode = 6;
00356 
00357                 } else {
00358                         /*BAD ENCODING */
00359                         goto end;
00360                 }
00361 
00362                 /*
00363                  *Go and decode the remaining byte(s)
00364                  *(if any) to get the current character.
00365                  */
00366                 for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) {
00367                         /*decode the next byte */
00368                         in_index++;
00369 
00370                         /*byte pattern must be: 10xx xxxx */
00371                         if ((a_in[in_index] & 0xC0) != 0x80) {
00372                                 goto end;
00373                         }
00374 
00375                         c = (c << 6) | (a_in[in_index] & 0x3F);
00376                 }
00377 
00378                 /*
00379                  *The decoded ucs4 char is now
00380                  *in c.
00381                  */
00382 
00383                 /************************
00384                  *Some security tests
00385                  ***********************/
00386 
00387                 /*be sure c is a char */
00388                 if (c == 0xFFFF || c == 0xFFFE)
00389                         goto end;
00390 
00391                 /*be sure c is inferior to the max ucs4 char value */
00392                 if (c > 0x10FFFF)
00393                         goto end;
00394 
00395                 /*
00396                  *c must be less than UTF16 "lower surrogate begin"
00397                  *or higher than UTF16 "High surrogate end"
00398                  */
00399                 if (c >= 0xD800 && c <= 0xDFFF)
00400                         goto end;
00401 
00402                 /*Avoid characters that equals zero */
00403                 if (c == 0)
00404                         goto end;
00405 
00406                 a_out[out_index] = c;
00407         }
00408 
00409       end:
00410         *a_out_len = out_index + 1;
00411         *a_in_len = in_index + 1;
00412 
00413         return status;
00414 }
00415 
00416 /**
00417  *Reads a character from an utf8 buffer.
00418  *Actually decode the next character code (unicode character code)
00419  *and returns it.
00420  *@param a_in the starting address of the utf8 buffer.
00421  *@param a_in_len the length of the utf8 buffer.
00422  *@param a_out output parameter. The resulting read char.
00423  *@param a_consumed the number of the bytes consumed to
00424  *decode the returned character code.
00425  *@return CR_OK upon successfull completion, an error code otherwise.
00426  */
00427 enum CRStatus
00428 cr_utils_read_char_from_utf8_buf (const guchar * a_in,
00429                                   gulong a_in_len,
00430                                   guint32 * a_out, gulong * a_consumed)
00431 {
00432         gulong in_index = 0,
00433                nb_bytes_2_decode = 0;
00434         enum CRStatus status = CR_OK;
00435 
00436         /*
00437          *to store the final decoded 
00438          *unicode char
00439          */
00440         guint32 c = 0;
00441 
00442         g_return_val_if_fail (a_in && a_out && a_out
00443                               && a_consumed, CR_BAD_PARAM_ERROR);
00444 
00445         if (a_in_len < 1) {
00446                 status = CR_OK;
00447                 goto end;
00448         }
00449 
00450         if (*a_in <= 0x7F) {
00451                 /*
00452                  *7 bits long char
00453                  *encoded over 1 byte:
00454                  * 0xxx xxxx
00455                  */
00456                 c = *a_in;
00457                 nb_bytes_2_decode = 1;
00458 
00459         } else if ((*a_in & 0xE0) == 0xC0) {
00460                 /*
00461                  *up to 11 bits long char.
00462                  *encoded over 2 bytes:
00463                  *110x xxxx  10xx xxxx
00464                  */
00465                 c = *a_in & 0x1F;
00466                 nb_bytes_2_decode = 2;
00467 
00468         } else if ((*a_in & 0xF0) == 0xE0) {
00469                 /*
00470                  *up to 16 bit long char
00471                  *encoded over 3 bytes:
00472                  *1110 xxxx  10xx xxxx  10xx xxxx
00473                  */
00474                 c = *a_in & 0x0F;
00475                 nb_bytes_2_decode = 3;
00476 
00477         } else if ((*a_in & 0xF8) == 0xF0) {
00478                 /*
00479                  *up to 21 bits long char
00480                  *encoded over 4 bytes:
00481                  *1111 0xxx  10xx xxxx  10xx xxxx  10xx xxxx
00482                  */
00483                 c = *a_in & 0x7;
00484                 nb_bytes_2_decode = 4;
00485 
00486         } else if ((*a_in & 0xFC) == 0xF8) {
00487                 /*
00488                  *up to 26 bits long char
00489                  *encoded over 5 bytes.
00490                  *1111 10xx  10xx xxxx  10xx xxxx  
00491                  *10xx xxxx  10xx xxxx
00492                  */
00493                 c = *a_in & 3;
00494                 nb_bytes_2_decode = 5;
00495 
00496         } else if ((*a_in & 0xFE) == 0xFC) {
00497                 /*
00498                  *up to 31 bits long char
00499                  *encoded over 6 bytes:
00500                  *1111 110x  10xx xxxx  10xx xxxx  
00501                  *10xx xxxx  10xx xxxx  10xx xxxx
00502                  */
00503                 c = *a_in & 1;
00504                 nb_bytes_2_decode = 6;
00505 
00506         } else {
00507                 /*BAD ENCODING */
00508                 goto end;
00509         }
00510 
00511         if (nb_bytes_2_decode > a_in_len) {
00512                 status = CR_END_OF_INPUT_ERROR;
00513                 goto end;
00514         }
00515 
00516         /*
00517          *Go and decode the remaining byte(s)
00518          *(if any) to get the current character.
00519          */
00520         for (in_index = 1; in_index < nb_bytes_2_decode; in_index++) {
00521                 /*byte pattern must be: 10xx xxxx */
00522                 if ((a_in[in_index] & 0xC0) != 0x80) {
00523                         goto end;
00524                 }
00525 
00526                 c = (c << 6) | (a_in[in_index] & 0x3F);
00527         }
00528 
00529         /*
00530          *The decoded ucs4 char is now
00531          *in c.
00532          */
00533 
00534     /************************
00535      *Some security tests
00536      ***********************/
00537 
00538         /*be sure c is a char */
00539         if (c == 0xFFFF || c == 0xFFFE)
00540                 goto end;
00541 
00542         /*be sure c is inferior to the max ucs4 char value */
00543         if (c > 0x10FFFF)
00544                 goto end;
00545 
00546         /*
00547          *c must be less than UTF16 "lower surrogate begin"
00548          *or higher than UTF16 "High surrogate end"
00549          */
00550         if (c >= 0xD800 && c <= 0xDFFF)
00551                 goto end;
00552 
00553         /*Avoid characters that equals zero */
00554         if (c == 0)
00555                 goto end;
00556 
00557         *a_out = c;
00558 
00559       end:
00560         *a_consumed = nb_bytes_2_decode;
00561 
00562         return status;
00563 }
00564 
00565 /**
00566  *
00567  */
00568 enum CRStatus
00569 cr_utils_utf8_str_len_as_ucs1 (const guchar * a_in_start,
00570                                const guchar * a_in_end, gulong * a_len)
00571 {
00572         /*
00573          *Note: this function can be made shorter
00574          *but it considers all the cases of the utf8 encoding
00575          *to ease further extensions ...
00576          */
00577 
00578         guchar *byte_ptr = NULL;
00579         gint len = 0;
00580 
00581         /*
00582          *to store the final decoded 
00583          *unicode char
00584          */
00585         guint c = 0;
00586 
00587         g_return_val_if_fail (a_in_start && a_in_end && a_len,
00588                               CR_BAD_PARAM_ERROR);
00589         *a_len = 0;
00590 
00591         for (byte_ptr = (guchar *) a_in_start;
00592              byte_ptr <= a_in_end; byte_ptr++) {
00593                 gint nb_bytes_2_decode = 0;
00594 
00595                 if (*byte_ptr <= 0x7F) {
00596                         /*
00597                          *7 bits long char
00598                          *encoded over 1 byte:
00599                          * 0xxx xxxx
00600                          */
00601                         c = *byte_ptr;
00602                         nb_bytes_2_decode = 1;
00603 
00604                 } else if ((*byte_ptr & 0xE0) == 0xC0) {
00605                         /*
00606                          *up to 11 bits long char.
00607                          *encoded over 2 bytes:
00608                          *110x xxxx  10xx xxxx
00609                          */
00610                         c = *byte_ptr & 0x1F;
00611                         nb_bytes_2_decode = 2;
00612 
00613                 } else if ((*byte_ptr & 0xF0) == 0xE0) {
00614                         /*
00615                          *up to 16 bit long char
00616                          *encoded over 3 bytes:
00617                          *1110 xxxx  10xx xxxx  10xx xxxx
00618                          */
00619                         c = *byte_ptr & 0x0F;
00620                         nb_bytes_2_decode = 3;
00621 
00622                 } else if ((*byte_ptr & 0xF8) == 0xF0) {
00623                         /*
00624                          *up to 21 bits long char
00625                          *encoded over 4 bytes:
00626                          *1111 0xxx  10xx xxxx  10xx xxxx  10xx xxxx
00627                          */
00628                         c = *byte_ptr & 0x7;
00629                         nb_bytes_2_decode = 4;
00630 
00631                 } else if ((*byte_ptr & 0xFC) == 0xF8) {
00632                         /*
00633                          *up to 26 bits long char
00634                          *encoded over 5 bytes.
00635                          *1111 10xx  10xx xxxx  10xx xxxx  
00636                          *10xx xxxx  10xx xxxx
00637                          */
00638                         c = *byte_ptr & 3;
00639                         nb_bytes_2_decode = 5;
00640 
00641                 } else if ((*byte_ptr & 0xFE) == 0xFC) {
00642                         /*
00643                          *up to 31 bits long char
00644                          *encoded over 6 bytes:
00645                          *1111 110x  10xx xxxx  10xx xxxx  
00646                          *10xx xxxx  10xx xxxx  10xx xxxx
00647                          */
00648                         c = *byte_ptr & 1;
00649                         nb_bytes_2_decode = 6;
00650 
00651                 } else {
00652                         /*
00653                          *BAD ENCODING
00654                          */
00655                         return CR_ENCODING_ERROR;
00656                 }
00657 
00658                 /*
00659                  *Go and decode the remaining byte(s)
00660                  *(if any) to get the current character.
00661                  */
00662                 for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) {
00663                         /*decode the next byte */
00664                         byte_ptr++;
00665 
00666                         /*byte pattern must be: 10xx xxxx */
00667                         if ((*byte_ptr & 0xC0) != 0x80) {
00668                                 return CR_ENCODING_ERROR;
00669                         }
00670 
00671                         c = (c << 6) | (*byte_ptr & 0x3F);
00672                 }
00673 
00674                 /*
00675                  *The decoded ucs4 char is now
00676                  *in c.
00677                  */
00678 
00679                 if (c <= 0xFF) { /*Add other conditions to support
00680                                   *other char sets (ucs2, ucs3, ucs4).
00681                                   */
00682                         len++;
00683                 } else {
00684                         /*the char is too long to fit
00685                          *into the supposed charset len.
00686                          */
00687                         return CR_ENCODING_ERROR;
00688                 }
00689         }
00690 
00691         *a_len = len;
00692 
00693         return CR_OK;
00694 }
00695 
00696 /**
00697  *Converts an utf8 string into an ucs4 string.
00698  *@param a_in the input string to convert.
00699  *@param a_in_len in/out parameter. The length of the input
00700  *string. After return, points to the actual number of bytes
00701  *consumed. This can be usefull to debug the input stream in case
00702  *of encoding error.
00703  *@param a_out out parameter. Points to the output string. It is allocated 
00704  *by this function and must be freed by the caller.
00705  *@param a_out_len out parameter. The length of the output string.
00706  *@return CR_OK upon successfull completion, an error code otherwise.
00707  *
00708  */
00709 enum CRStatus
00710 cr_utils_utf8_str_to_ucs4 (const guchar * a_in,
00711                            gulong * a_in_len,
00712                            guint32 ** a_out, gulong * a_out_len)
00713 {
00714         enum CRStatus status = CR_OK;
00715 
00716         g_return_val_if_fail (a_in && a_in_len
00717                               && a_out && a_out_len, CR_BAD_PARAM_ERROR);
00718 
00719         status = cr_utils_utf8_str_len_as_ucs4 (a_in,
00720                                                 &a_in[*a_in_len - 1],
00721                                                 a_out_len);
00722 
00723         g_return_val_if_fail (status == CR_OK, status);
00724 
00725         *a_out = g_malloc0 (*a_out_len * sizeof (guint32));
00726 
00727         status = cr_utils_utf8_to_ucs4 (a_in, a_in_len, *a_out, a_out_len);
00728 
00729         return status;
00730 }
00731 
00732 /**
00733  *Converts an ucs4 buffer into an utf8 buffer.
00734  *
00735  *@param a_in the input ucs4 buffer to convert.
00736  *@param a_in_len in/out parameter. The size of the
00737  *input buffer to convert. After return, this parameter contains
00738  *the actual number of characters consumed.
00739  *@param a_out the output converted utf8 buffer. Must be allocated by
00740  *the caller.
00741  *@param a_out_len in/out parameter. The size of the output buffer.
00742  *If this size is actually smaller than the real needed size, the function
00743  *just converts what it can and returns a success status. After return,
00744  *this param points to the actual number of bytes in the buffer.
00745  *@return CR_OK upon successfull completion, an error code otherwise.
00746  */
00747 enum CRStatus
00748 cr_utils_ucs4_to_utf8 (const guint32 * a_in,
00749                        gulong * a_in_len, guchar * a_out, gulong * a_out_len)
00750 {
00751         gulong in_len = 0,
00752                 in_index = 0,
00753                 out_index = 0;
00754         enum CRStatus status = CR_OK;
00755 
00756         g_return_val_if_fail (a_in && a_in_len && a_out && a_out_len,
00757                               CR_BAD_PARAM_ERROR);
00758 
00759         if (*a_in_len < 1) {
00760                 status = CR_OK;
00761                 goto end;
00762         }
00763 
00764         in_len = *a_in_len;
00765 
00766         for (in_index = 0; in_index < in_len; in_index++) {
00767                 /*
00768                  *FIXME: return whenever we encounter forbidden char values.
00769                  */
00770 
00771                 if (a_in[in_index] <= 0x7F) {
00772                         a_out[out_index] = a_in[in_index];
00773                         out_index++;
00774                 } else if (a_in[in_index] <= 0x7FF) {
00775                         a_out[out_index] = (0xC0 | (a_in[in_index] >> 6));
00776                         a_out[out_index + 1] =
00777                                 (0x80 | (a_in[in_index] & 0x3F));
00778                         out_index += 2;
00779                 } else if (a_in[in_index] <= 0xFFFF) {
00780                         a_out[out_index] = (0xE0 | (a_in[in_index] >> 12));
00781                         a_out[out_index + 1] =
00782                                 (0x80 | ((a_in[in_index] >> 6) & 0x3F));
00783                         a_out[out_index + 2] =
00784                                 (0x80 | (a_in[in_index] & 0x3F));
00785                         out_index += 3;
00786                 } else if (a_in[in_index] <= 0x1FFFFF) {
00787                         a_out[out_index] = (0xF0 | (a_in[in_index] >> 18));
00788                         a_out[out_index + 1]
00789                                 = (0x80 | ((a_in[in_index] >> 12) & 0x3F));
00790                         a_out[out_index + 2]
00791                                 = (0x80 | ((a_in[in_index] >> 6) & 0x3F));
00792                         a_out[out_index + 3]
00793                                 = (0x80 | (a_in[in_index] & 0x3F));
00794                         out_index += 4;
00795                 } else if (a_in[in_index] <= 0x3FFFFFF) {
00796                         a_out[out_index] = (0xF8 | (a_in[in_index] >> 24));
00797                         a_out[out_index + 1] =
00798                                 (0x80 | (a_in[in_index] >> 18));
00799                         a_out[out_index + 2]
00800                                 = (0x80 | ((a_in[in_index] >> 12) & 0x3F));
00801                         a_out[out_index + 3]
00802                                 = (0x80 | ((a_in[in_index] >> 6) & 0x3F));
00803                         a_out[out_index + 4]
00804                                 = (0x80 | (a_in[in_index] & 0x3F));
00805                         out_index += 5;
00806                 } else if (a_in[in_index] <= 0x7FFFFFFF) {
00807                         a_out[out_index] = (0xFC | (a_in[in_index] >> 30));
00808                         a_out[out_index + 1] =
00809                                 (0x80 | (a_in[in_index] >> 24));
00810                         a_out[out_index + 2]
00811                                 = (0x80 | ((a_in[in_index] >> 18) & 0x3F));
00812                         a_out[out_index + 3]
00813                                 = (0x80 | ((a_in[in_index] >> 12) & 0x3F));
00814                         a_out[out_index + 4]
00815                                 = (0x80 | ((a_in[in_index] >> 6) & 0x3F));
00816                         a_out[out_index + 4]
00817                                 = (0x80 | (a_in[in_index] & 0x3F));
00818                         out_index += 6;
00819                 } else {
00820                         status = CR_ENCODING_ERROR;
00821                         goto end;
00822                 }
00823         }                       /*end for */
00824 
00825       end:
00826         *a_in_len = in_index + 1;
00827         *a_out_len = out_index + 1;
00828 
00829         return status;
00830 }
00831 
00832 /**
00833  *Converts an ucs4 string into an utf8 string.
00834  *@param a_in the input string to convert.
00835  *@param a_in_len in/out parameter. The length of the input
00836  *string. After return, points to the actual number of characters
00837  *consumed. This can be usefull to debug the input string in case
00838  *of encoding error.
00839  *@param a_out out parameter. Points to the output string. It is allocated 
00840  *by this function and must be freed by the caller.
00841  *@param a_out_len out parameter. The length (in bytes) of the output string.
00842  *@return CR_OK upon successfull completion, an error code otherwise.
00843  */
00844 enum CRStatus
00845 cr_utils_ucs4_str_to_utf8 (const guint32 * a_in,
00846                            gulong * a_in_len,
00847                            guchar ** a_out, gulong * a_out_len)
00848 {
00849         enum CRStatus status = CR_OK;
00850 
00851         g_return_val_if_fail (a_in && a_in_len && a_out
00852                               && a_out_len, CR_BAD_PARAM_ERROR);
00853 
00854         status = cr_utils_ucs4_str_len_as_utf8 (a_in,
00855                                                 &a_in[*a_out_len - 1],
00856                                                 a_out_len);
00857 
00858         g_return_val_if_fail (status == CR_OK, status);
00859 
00860         status = cr_utils_ucs4_to_utf8 (a_in, a_in_len, *a_out, a_out_len);
00861 
00862         return status;
00863 }
00864 
00865 /**
00866  *Converts an ucs1 buffer into an utf8 buffer.
00867  *The caller must know the size of the resulting buffer and
00868  *allocate it prior to calling this function.
00869  *
00870  *@param a_in the input ucs1 buffer.
00871  *
00872  *@param a_in_len in/out parameter. The length of the input buffer.
00873  *After return, points to the number of bytes actually consumed even
00874  *in case of encoding error.
00875  *
00876  *@param a_out out parameter. The output utf8 converted buffer.
00877  *
00878  *@param a_out_len in/out parameter. The size of the output buffer.
00879  *If the output buffer size is shorter than the actual needed size, 
00880  *this function just convert what it can.
00881  *
00882  *@return CR_OK upon successfull completion, an error code otherwise.
00883  *
00884  */
00885 enum CRStatus
00886 cr_utils_ucs1_to_utf8 (const guchar * a_in,
00887                        gulong * a_in_len, guchar * a_out, gulong * a_out_len)
00888 {
00889         gulong out_index = 0,
00890                 in_index = 0,
00891                 in_len = 0,
00892                 out_len = 0;
00893         enum CRStatus status = CR_OK;
00894 
00895         g_return_val_if_fail (a_in && a_in_len
00896                               && a_out_len, 
00897                               CR_BAD_PARAM_ERROR);
00898 
00899         if (*a_in_len == 0) {
00900                 *a_out_len = 0 ;
00901                 return status;
00902         }
00903         g_return_val_if_fail (a_out, CR_BAD_PARAM_ERROR) ;
00904 
00905         in_len = *a_in_len;
00906         out_len = *a_out_len;
00907 
00908         for (in_index = 0, out_index = 0;
00909              (in_index < in_len) && (out_index < out_len); in_index++) {
00910                 /*
00911                  *FIXME: return whenever we encounter forbidden char values.
00912                  */
00913 
00914                 if (a_in[in_index] <= 0x7F) {
00915                         a_out[out_index] = a_in[in_index];
00916                         out_index++;
00917                 } else {
00918                         a_out[out_index] = (0xC0 | (a_in[in_index] >> 6));
00919                         a_out[out_index + 1] =
00920                                 (0x80 | (a_in[in_index] & 0x3F));
00921                         out_index += 2;
00922                 }
00923         }                       /*end for */
00924 
00925         *a_in_len = in_index;
00926         *a_out_len = out_index;
00927 
00928         return status;
00929 }
00930 
00931 /**
00932  *Converts an ucs1 string into an utf8 string.
00933  *@param a_in_start the beginning of the input string to convert.
00934  *@param a_in_end the end of the input string to convert.
00935  *@param a_out out parameter. The converted string.
00936  *@param a_out out parameter. The length of the converted string.
00937  *@return CR_OK upon successfull completion, an error code otherwise.
00938  *
00939  */
00940 enum CRStatus
00941 cr_utils_ucs1_str_to_utf8 (const guchar * a_in,
00942                            gulong * a_in_len,
00943                            guchar ** a_out, gulong * a_out_len)
00944 {
00945         gulong out_len = 0;
00946         enum CRStatus status = CR_OK;
00947 
00948         g_return_val_if_fail (a_in && a_in_len && a_out
00949                               && a_out_len, CR_BAD_PARAM_ERROR);
00950 
00951         if (*a_in_len < 1) {
00952                 *a_out_len = 0;
00953                 *a_out = NULL;
00954                 return CR_OK;
00955         }
00956 
00957         status = cr_utils_ucs1_str_len_as_utf8 (a_in, &a_in[*a_in_len - 1],
00958                                                 &out_len);
00959 
00960         g_return_val_if_fail (status == CR_OK, status);
00961 
00962         *a_out = g_malloc0 (out_len);
00963 
00964         status = cr_utils_ucs1_to_utf8 (a_in, a_in_len, *a_out, &out_len);
00965 
00966         *a_out_len = out_len;
00967 
00968         return status;
00969 }
00970 
00971 /**
00972  *Converts an utf8 buffer into an ucs1 buffer.
00973  *The caller must know the size of the resulting
00974  *converted buffer, and allocated it prior to calling this
00975  *function.
00976  *
00977  *@param a_in the input utf8 buffer to convert.
00978  *
00979  *@param a_in_len in/out parameter. The size of the input utf8 buffer.
00980  *After return, points to the number of bytes consumed
00981  *by the function even in case of encoding error.
00982  *
00983  *@param a_out out parameter. Points to the resulting buffer.
00984  *Must be allocated by the caller. If the size of a_out is shorter
00985  *than its required size, this function converts what it can and return
00986  *a successfull status.
00987  *
00988  *@param a_out_len in/out parameter. The size of the output buffer.
00989  *After return, points to the number of bytes consumed even in case of
00990  *encoding error.
00991  *
00992  *@return CR_OK upon successfull completion, an error code otherwise.
00993  */
00994 enum CRStatus
00995 cr_utils_utf8_to_ucs1 (const guchar * a_in,
00996                        gulong * a_in_len, guchar * a_out, gulong * a_out_len)
00997 {
00998         gulong in_index = 0,
00999                 out_index = 0,
01000                 in_len = 0,
01001                 out_len = 0;
01002         enum CRStatus status = CR_OK;
01003 
01004         /*
01005          *to store the final decoded 
01006          *unicode char
01007          */
01008         guint32 c = 0;
01009 
01010         g_return_val_if_fail (a_in && a_in_len
01011                               && a_out && a_out_len, CR_BAD_PARAM_ERROR);
01012 
01013         if (*a_in_len < 1) {
01014                 goto end;
01015         }
01016 
01017         in_len = *a_in_len;
01018         out_len = *a_out_len;
01019 
01020         for (in_index = 0, out_index = 0;
01021              (in_index < in_len) && (out_index < out_len);
01022              in_index++, out_index++) {
01023                 gint nb_bytes_2_decode = 0;
01024 
01025                 if (a_in[in_index] <= 0x7F) {
01026                         /*
01027                          *7 bits long char
01028                          *encoded over 1 byte:
01029                          * 0xxx xxxx
01030                          */
01031                         c = a_in[in_index];
01032                         nb_bytes_2_decode = 1;
01033 
01034                 } else if ((a_in[in_index] & 0xE0) == 0xC0) {
01035                         /*
01036                          *up to 11 bits long char.
01037                          *encoded over 2 bytes:
01038                          *110x xxxx  10xx xxxx
01039                          */
01040                         c = a_in[in_index] & 0x1F;
01041                         nb_bytes_2_decode = 2;
01042 
01043                 } else if ((a_in[in_index] & 0xF0) == 0xE0) {
01044                         /*
01045                          *up to 16 bit long char
01046                          *encoded over 3 bytes:
01047                          *1110 xxxx  10xx xxxx  10xx xxxx
01048                          */
01049                         c = a_in[in_index] & 0x0F;
01050                         nb_bytes_2_decode = 3;
01051 
01052                 } else if ((a_in[in_index] & 0xF8) == 0xF0) {
01053                         /*
01054                          *up to 21 bits long char
01055                          *encoded over 4 bytes:
01056                          *1111 0xxx  10xx xxxx  10xx xxxx  10xx xxxx
01057                          */
01058                         c = a_in[in_index] & 0x7;
01059                         nb_bytes_2_decode = 4;
01060 
01061                 } else if ((a_in[in_index] & 0xFC) == 0xF8) {
01062                         /*
01063                          *up to 26 bits long char
01064                          *encoded over 5 bytes.
01065                          *1111 10xx  10xx xxxx  10xx xxxx  
01066                          *10xx xxxx  10xx xxxx
01067                          */
01068                         c = a_in[in_index] & 3;
01069                         nb_bytes_2_decode = 5;
01070 
01071                 } else if ((a_in[in_index] & 0xFE) == 0xFC) {
01072                         /*
01073                          *up to 31 bits long char
01074                          *encoded over 6 bytes:
01075                          *1111 110x  10xx xxxx  10xx xxxx  
01076                          *10xx xxxx  10xx xxxx  10xx xxxx
01077                          */
01078                         c = a_in[in_index] & 1;
01079                         nb_bytes_2_decode = 6;
01080 
01081                 } else {
01082                         /*BAD ENCODING */
01083                         status = CR_ENCODING_ERROR;
01084                         goto end;
01085                 }
01086 
01087                 /*
01088                  *Go and decode the remaining byte(s)
01089                  *(if any) to get the current character.
01090                  */
01091                 if (in_index + nb_bytes_2_decode - 1 >= in_len) {
01092                         goto end;
01093                 }
01094 
01095                 for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) {
01096                         /*decode the next byte */
01097                         in_index++;
01098 
01099                         /*byte pattern must be: 10xx xxxx */
01100                         if ((a_in[in_index] & 0xC0) != 0x80) {
01101                                 status = CR_ENCODING_ERROR;
01102                                 goto end;
01103                         }
01104 
01105                         c = (c << 6) | (a_in[in_index] & 0x3F);
01106                 }
01107 
01108                 /*
01109                  *The decoded ucs4 char is now
01110                  *in c.
01111                  */
01112 
01113                 if (c > 0xFF) {
01114                         status = CR_ENCODING_ERROR;
01115                         goto end;
01116                 }
01117 
01118                 a_out[out_index] = c;
01119         }
01120 
01121       end:
01122         *a_out_len = out_index;
01123         *a_in_len = in_index;
01124 
01125         return status;
01126 }
01127 
01128 /**
01129  *Converts an utf8 buffer into an
01130  *ucs1 buffer.
01131  *@param a_in_start the start of the input buffer.
01132  *@param a_in_end the end of the input buffer.
01133  *@param a_out out parameter. The resulting converted ucs4 buffer.
01134  *Must be freed by the caller.
01135  *@param a_out_len out parameter. The length of the converted buffer.
01136  *@return CR_OK upon successfull completion, an error code otherwise.
01137  *Note that out parameters are valid if and only if this function
01138  *returns CR_OK.
01139  */
01140 enum CRStatus
01141 cr_utils_utf8_str_to_ucs1 (const guchar * a_in,
01142                            gulong * a_in_len,
01143                            guchar ** a_out, gulong * a_out_len)
01144 {
01145         enum CRStatus status = CR_OK;
01146 
01147         g_return_val_if_fail (a_in && a_in_len
01148                               && a_out && a_out_len, CR_BAD_PARAM_ERROR);
01149 
01150         if (*a_in_len < 1) {
01151                 *a_out_len = 0;
01152                 *a_out = NULL;
01153                 return CR_OK;
01154         }
01155 
01156         status = cr_utils_utf8_str_len_as_ucs4 (a_in, &a_in[*a_in_len - 1],
01157                                                 a_out_len);
01158 
01159         g_return_val_if_fail (status == CR_OK, status);
01160 
01161         *a_out = g_malloc0 (*a_out_len * sizeof (guint32));
01162 
01163         status = cr_utils_utf8_to_ucs1 (a_in, a_in_len, *a_out, a_out_len);
01164         return status;
01165 }
01166 
01167 /*****************************************
01168  *CSS basic types identification utilities
01169  *****************************************/
01170 
01171 /**
01172  *Returns TRUE if a_char is a white space as
01173  *defined in the css spec in chap 4.1.1.
01174  *
01175  *white-space ::= ' '| \t|\r|\n|\f
01176  *
01177  *@param a_char the character to test.
01178  *return TRUE if is a white space, false otherwise.
01179  */
01180 gboolean
01181 cr_utils_is_white_space (guint32 a_char)
01182 {
01183         switch (a_char) {
01184         case ' ':
01185         case '\t':
01186         case '\r':
01187         case '\n':
01188         case '\f':
01189                 return TRUE;
01190                 break;
01191         default:
01192                 return FALSE;
01193         }
01194 }
01195 
01196 /**
01197  *Returns true if the character is a newline
01198  *as defined in the css spec in the chap 4.1.1.
01199  *
01200  *nl ::= \n|\r\n|\r|\f
01201  *
01202  *@param a_char the character to test.
01203  *@return TRUE if the character is a newline, FALSE otherwise.
01204  */
01205 gboolean
01206 cr_utils_is_newline (guint32 a_char)
01207 {
01208         switch (a_char) {
01209         case '\n':
01210         case '\r':
01211         case '\f':
01212                 return TRUE;
01213                 break;
01214         default:
01215                 return FALSE;
01216         }
01217 }
01218 
01219 /**
01220  *returns TRUE if the char is part of an hexa num char:
01221  *i.e hexa_char ::= [0-9A-F]
01222  */
01223 gboolean
01224 cr_utils_is_hexa_char (guint32 a_char)
01225 {
01226         if ((a_char >= '0' && a_char <= '9')
01227             || (a_char >= 'A' && a_char <= 'F')) {
01228                 return TRUE;
01229         }
01230         return FALSE;
01231 }
01232 
01233 /**
01234  *Returns true if the character is a nonascii
01235  *character (as defined in the css spec chap 4.1.1):
01236  *
01237  *nonascii ::= [^\0-\177]
01238  *
01239  *@param a_char the character to test.
01240  *@return TRUE if the character is a nonascii char,
01241  *FALSE otherwise.
01242  */
01243 gboolean
01244 cr_utils_is_nonascii (guint32 a_char)
01245 {
01246         if (a_char <= 177) {
01247                 return FALSE;
01248         }
01249 
01250         return TRUE;
01251 }
01252 
01253 /**
01254  *Dumps a character a_nb times on a file.
01255  *@param a_char the char to dump
01256  *@param a_fp the destination file pointer
01257  *@param a_nb the number of times a_char is to be dumped.
01258  */
01259 void
01260 cr_utils_dump_n_chars (guchar a_char, FILE * a_fp, glong a_nb)
01261 {
01262         glong i = 0;
01263 
01264         for (i = 0; i < a_nb; i++) {
01265                 fprintf (a_fp, "%c", a_char);
01266         }
01267 }
01268 
01269 void
01270 cr_utils_dump_n_chars2 (guchar a_char, GString * a_string, glong a_nb)
01271 {
01272         glong i = 0;
01273 
01274         g_return_if_fail (a_string);
01275 
01276         for (i = 0; i < a_nb; i++) {
01277                 g_string_append_printf (a_string, "%c", a_char);
01278         }
01279 }
01280 
01281 /**
01282  *Duplicates a list of GString instances.
01283  *@return the duplicated list of GString instances or NULL if
01284  *something bad happened.
01285  *@param a_list_of_strings the list of strings to be duplicated.
01286  */
01287 GList *
01288 cr_utils_dup_glist_of_string (GList const * a_list_of_strings)
01289 {
01290         GList const *cur = NULL;
01291         GList *result = NULL;
01292 
01293         g_return_val_if_fail (a_list_of_strings, NULL);
01294 
01295         for (cur = a_list_of_strings; cur; cur = cur->next) {
01296                 GString *str = NULL;
01297 
01298                 str = g_string_new_len (((GString *) cur->data)->str,
01299                                         ((GString *) cur->data)->len);
01300                 if (str)
01301                         result = g_list_append (result, str);
01302         }
01303 
01304         return result;
01305 }
01306 
01307 /**
01308  *Duplicate a GList where the GList::data is a CRString.
01309  *@param a_list_of_strings the list to duplicate
01310  *@return the duplicated list, or NULL if something bad
01311  *happened.
01312  */
01313 GList *
01314 cr_utils_dup_glist_of_cr_string (GList const * a_list_of_strings)
01315 {
01316         GList const *cur = NULL;
01317         GList *result = NULL;
01318 
01319         g_return_val_if_fail (a_list_of_strings, NULL);
01320 
01321         for (cur = a_list_of_strings; cur; cur = cur->next) {
01322                 CRString *str = NULL;
01323 
01324                 str = cr_string_dup ((CRString const *) cur->data) ;
01325                 if (str)
01326                         result = g_list_append (result, str);
01327         }
01328 
01329         return result;
01330 }