00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 #include <string.h>
00021 #include <libaudcore/audstrings.h>
00022
00023 #include "config.h"
00024 #include "debug.h"
00025 #include "i18n.h"
00026 #include "main.h"
00027 #include "misc.h"
00028
00029 #ifdef USE_CHARDET
00030 # include <libguess.h>
00031 #endif
00032
00033 static gchar * cd_chardet_to_utf8 (const gchar * str, gssize len,
00034 gsize * arg_bytes_read, gsize * arg_bytes_write, GError ** error);
00035
00036 static gchar * str_to_utf8_fallback (const gchar * str)
00037 {
00038 gchar * out = g_strconcat (str, _(" (invalid UTF-8)"), NULL);
00039
00040 for (gchar * c = out; * c; c ++)
00041 {
00042 if (* c & 0x80)
00043 * c = '?';
00044 }
00045
00046 return out;
00047 }
00048
00049 static gchar * cd_str_to_utf8 (const gchar * str)
00050 {
00051 gchar *out_str;
00052
00053 if (str == NULL)
00054 return NULL;
00055
00056
00057
00058
00059
00060
00061
00062
00063
00064
00065
00066
00067
00068
00069
00070
00071
00072
00073
00074
00075
00076
00077
00078
00079 #ifdef USE_CHARDET
00080 if (libguess_validate_utf8(str, strlen(str)))
00081 return g_strdup(str);
00082 #else
00083 if (g_utf8_validate(str, strlen(str), NULL))
00084 return g_strdup(str);
00085 #endif
00086
00087
00088 if ((out_str = cd_chardet_to_utf8(str, strlen(str), NULL, NULL, NULL)) != NULL)
00089 return out_str;
00090
00091
00092 return str_to_utf8_fallback(str);
00093 }
00094
00095 static gchar * cd_chardet_to_utf8 (const gchar * str, gssize len,
00096 gsize * arg_bytes_read, gsize * arg_bytes_write, GError ** error)
00097 {
00098 if (error)
00099 * error = NULL;
00100
00101 gchar *ret = NULL;
00102 gsize *bytes_read, *bytes_write;
00103 gsize my_bytes_read, my_bytes_write;
00104
00105 bytes_read = arg_bytes_read != NULL ? arg_bytes_read : &my_bytes_read;
00106 bytes_write = arg_bytes_write != NULL ? arg_bytes_write : &my_bytes_write;
00107
00108 g_return_val_if_fail(str != NULL, NULL);
00109
00110 #ifdef USE_CHARDET
00111 if (libguess_validate_utf8(str, len))
00112 #else
00113 if (g_utf8_validate(str, len, NULL))
00114 #endif
00115 {
00116 if (len < 0)
00117 len = strlen (str);
00118
00119 ret = g_malloc (len + 1);
00120 memcpy (ret, str, len);
00121 ret[len] = 0;
00122
00123 if (arg_bytes_read != NULL)
00124 * arg_bytes_read = len;
00125 if (arg_bytes_write != NULL)
00126 * arg_bytes_write = len;
00127
00128 return ret;
00129 }
00130
00131 #ifdef USE_CHARDET
00132 gchar * det = get_string (NULL, "chardet_detector");
00133
00134 if (det[0])
00135 {
00136 AUDDBG("guess encoding (%s) %s\n", det, str);
00137 const gchar * encoding = libguess_determine_encoding (str, len, det);
00138 AUDDBG("encoding = %s\n", encoding);
00139 if (encoding)
00140 ret = g_convert (str, len, "UTF-8", encoding, bytes_read,
00141 bytes_write, (error && * error) ? NULL : error);
00142 }
00143
00144 g_free (det);
00145 #endif
00146
00147
00148 if (! ret)
00149 {
00150 gchar * fallbacks = get_string (NULL, "chardet_fallback");
00151 gchar * * split = g_strsplit_set (fallbacks, " ,:;|/", -1);
00152
00153 for (gchar * * enc = split; * enc; enc ++)
00154 {
00155 ret = g_convert (str, len, "UTF-8", * enc, bytes_read, bytes_write,
00156 (error && * error) ? NULL : error);
00157 if (len == *bytes_read)
00158 break;
00159 else {
00160 g_free(ret);
00161 ret = NULL;
00162 }
00163 }
00164
00165 g_strfreev (split);
00166 g_free (fallbacks);
00167 }
00168
00169
00170 if (ret == NULL)
00171 ret = g_locale_to_utf8 (str, len, bytes_read, bytes_write,
00172 (error && * error) ? NULL : error);
00173
00174
00175 if (ret == NULL)
00176 ret = g_convert (str, len, "UTF-8", "ISO-8859-1", bytes_read,
00177 bytes_write, (error && * error) ? NULL : error);
00178
00179 if (ret != NULL)
00180 {
00181 if (g_utf8_validate(ret, -1, NULL))
00182 return ret;
00183 else
00184 {
00185 g_warning("g_utf8_validate() failed for converted string in cd_chardet_to_utf8: '%s'", ret);
00186 g_free(ret);
00187 return NULL;
00188 }
00189 }
00190
00191 return NULL;
00192 }
00193
00194 void chardet_init (void)
00195 {
00196 str_set_utf8_impl (cd_str_to_utf8, cd_chardet_to_utf8);
00197 }