00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026 #undef DECODE_DEBUG
00027
00028
00029 #include <assert.h>
00030
00031 #include "decoder.h"
00032 using namespace khtml;
00033
00034 #include "htmlhashes.h"
00035
00036 #include <qregexp.h>
00037 #include <qtextcodec.h>
00038
00039 #include <kglobal.h>
00040 #include <kcharsets.h>
00041
00042 #include <ctype.h>
00043 #include <kdebug.h>
00044 #include <klocale.h>
00045
/**
 * Helper used to guess which Japanese encoding a byte string is written in.
 *
 * All members are static; the class only serves as a namespace for the
 * detection routine judge() and the byte classification table it relies on.
 */
class KanjiCode
{
public:
    /** Result of judge(). ASCII doubles as "nothing conclusive found". */
    enum Type { ASCII, JIS, EUC, SJIS, UNICODE, UTF8 };

    /** Inspects the NUL-terminated string @p str and returns the guessed encoding. */
    static enum Type judge(const char *str);

    /** Escape byte (0x1b) that introduces the sequences judge() reports as JIS. */
    static const int ESC;
    /** Byte value 0x8e. */
    static const int _SS2_;
    /** 256-entry classification table indexed by byte value (bit 0 / bit 1 flags). */
    static const unsigned char kanji_map_sjis[];

    /**
     * Returns 1 when bit 0 is set in the table entry for @p code, 0 otherwise.
     * Values of 0x100 and above always yield 0.
     */
    static int ISkanji(int code)
    {
        return (code < 0x100) ? (kanji_map_sjis[code & 0xff] & 1) : 0;
    }

    /**
     * Returns 2 when bit 1 is set in the table entry for @p code, 0 otherwise.
     * Values of 0x100 and above always yield 0.
     */
    static int ISkana(int code)
    {
        return (code < 0x100) ? (kanji_map_sjis[code & 0xff] & 2) : 0;
    }
};

const int KanjiCode::ESC   = 0x1b;
const int KanjiCode::_SS2_ = 0x8e;

// One row per high nibble.
//   1 -> byte is accepted by ISkanji()  (0x81-0x9f and 0xe0-0xfc)
//   2 -> byte is accepted by ISkana()   (0xa1-0xdf)
const unsigned char KanjiCode::kanji_map_sjis[] =
{
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x80 */ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xa0 */ 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xb0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xc0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xd0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xf0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0
};
00092
00093
00094
00095
00096
00097
00098
00099
00100
00101
00102
00103
00104
00105
00106
00107
00108
00109
00110
00111
00112
00113
00114
00115
00116
00117
00118 enum KanjiCode::Type KanjiCode::judge(const char *str)
00119 {
00120 enum Type code;
00121 int i;
00122 int bfr = FALSE;
00123 int bfk = 0;
00124 int sjis = 0;
00125 int euc = 0;
00126
00127 const unsigned char *ptr = (const unsigned char *) str;
00128 int size = strlen(str);
00129
00130 code = ASCII;
00131
00132 i = 0;
00133 while (i < size) {
00134 if (ptr[i] == ESC && (size - i >= 3)) {
00135 if ((ptr[i + 1] == '$' && ptr[i + 2] == 'B')
00136 || (ptr[i + 1] == '(' && ptr[i + 2] == 'B')) {
00137 code = JIS;
00138 goto breakBreak;
00139 } else if ((ptr[i + 1] == '$' && ptr[i + 2] == '@')
00140 || (ptr[i + 1] == '(' && ptr[i + 2] == 'J')) {
00141 code = JIS;
00142 goto breakBreak;
00143 } else if (ptr[i + 1] == '(' && ptr[i + 2] == 'I') {
00144 code = JIS;
00145 i += 3;
00146 } else if (ptr[i + 1] == ')' && ptr[i + 2] == 'I') {
00147 code = JIS;
00148 i += 3;
00149 } else {
00150 i++;
00151 }
00152 bfr = FALSE;
00153 bfk = 0;
00154 } else {
00155 if (ptr[i] < 0x20) {
00156 bfr = FALSE;
00157 bfk = 0;
00158
00159 if ((i >= 2) && (ptr[i - 2] == 0x81)
00160 && (0x41 <= ptr[i - 1] && ptr[i - 1] <= 0x49)) {
00161 code = SJIS;
00162 sjis += 100;
00163 } else if ((i >= 2) && (ptr[i - 2] == 0xa1)
00164 && (0xa2 <= ptr[i - 1] && ptr[i - 1] <= 0xaa)) {
00165 code = EUC;
00166 euc += 100;
00167 } else if ((i >= 2) && (ptr[i - 2] == 0x82) && (0xa0 <= ptr[i - 1])) {
00168 sjis += 40;
00169 } else if ((i >= 2) && (ptr[i - 2] == 0xa4) && (0xa0 <= ptr[i - 1])) {
00170 euc += 40;
00171 }
00172 } else {
00173
00174 if ((size - i > 1) && (ptr[i] == 0x82) && (0xa0 <= ptr[i + 1])) {
00175 sjis++;
00176 } else if ((size - i > 1) && (ptr[i] == 0x83)
00177 && (0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x9f)) {
00178 sjis++;
00179 } else if ((size - i > 1) && (ptr[i] == 0xa4) && (0xa0 <= ptr[i + 1])) {
00180 euc++;
00181 } else if ((size - i > 1) && (ptr[i] == 0xa5) && (0xa0 <= ptr[i + 1])) {
00182 euc++;
00183 }
00184 if (bfr) {
00185 if ((i >= 1) && (0x40 <= ptr[i] && ptr[i] <= 0xa0) && ISkanji(ptr[i - 1])) {
00186 code = SJIS;
00187 goto breakBreak;
00188 } else if ((i >= 1) && (0x81 <= ptr[i - 1] && ptr[i - 1] <= 0x9f) && ((0x40 <= ptr[i] && ptr[i] < 0x7e) || (0x7e < ptr[i] && ptr[i] <= 0xfc))) {
00189 code = SJIS;
00190 goto breakBreak;
00191 } else if ((i >= 1) && (0xfd <= ptr[i] && ptr[i] <= 0xfe) && (0xa1 <= ptr[i - 1] && ptr[i - 1] <= 0xfe)) {
00192 code = EUC;
00193 goto breakBreak;
00194 } else if ((i >= 1) && (0xfd <= ptr[i - 1] && ptr[i - 1] <= 0xfe) && (0xa1 <= ptr[i] && ptr[i] <= 0xfe)) {
00195 code = EUC;
00196 goto breakBreak;
00197 } else if ((i >= 1) && (ptr[i] < 0xa0 || 0xdf < ptr[i]) && (0x8e == ptr[i - 1])) {
00198 code = SJIS;
00199 goto breakBreak;
00200 } else if (ptr[i] <= 0x7f) {
00201 code = SJIS;
00202 goto breakBreak;
00203 } else {
00204 if (0xa1 <= ptr[i] && ptr[i] <= 0xa6) {
00205 euc++;
00206 } else if (0xa1 <= ptr[i] && ptr[i] <= 0xdf) {
00207 ;
00208 } else if (0xa1 <= ptr[i] && ptr[i] <= 0xfe) {
00209 euc++;
00210 } else if (0x8e == ptr[i]) {
00211 euc++;
00212 } else if (0x20 <= ptr[i] && ptr[i] <= 0x7f) {
00213 sjis++;
00214 }
00215 bfr = FALSE;
00216 bfk = 0;
00217 }
00218 } else if (0x8e == ptr[i]) {
00219 if (size - i <= 1) {
00220 ;
00221 } else if (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xdf) {
00222
00223 if (bfk == 1) {
00224 euc += 100;
00225 }
00226 bfk++;
00227 i++;
00228 } else {
00229
00230 code = SJIS;
00231 goto breakBreak;
00232 }
00233 } else if (0x81 <= ptr[i] && ptr[i] <= 0x9f) {
00234
00235 code = SJIS;
00236 if ((size - i >= 1)
00237 && ((0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x7e)
00238 || (0x80 <= ptr[i + 1] && ptr[i + 1] <= 0xfc))) {
00239 goto breakBreak;
00240 }
00241 } else if (0xfd <= ptr[i] && ptr[i] <= 0xfe) {
00242
00243 code = EUC;
00244 if ((size - i >= 1)
00245 && (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xfe)) {
00246 goto breakBreak;
00247 }
00248 } else if (ptr[i] <= 0x7f) {
00249 ;
00250 } else {
00251 bfr = TRUE;
00252 bfk = 0;
00253 }
00254 }
00255 i++;
00256 }
00257 }
00258 if (code == ASCII) {
00259 if (sjis > euc) {
00260 code = SJIS;
00261 } else if (sjis < euc) {
00262 code = EUC;
00263 }
00264 }
00265 breakBreak:
00266 return (code);
00267 }
00268
00269 Decoder::Decoder()
00270 {
00271
00272 m_codec = QTextCodec::codecForMib(4);
00273 m_decoder = m_codec->makeDecoder();
00274 enc = 0;
00275 body = false;
00276 beginning = true;
00277 visualRTL = false;
00278 haveEncoding = false;
00279 }
00280 Decoder::~Decoder()
00281 {
00282 delete m_decoder;
00283 }
00284
00285 void Decoder::setEncoding(const char *_encoding, bool force)
00286 {
00287 #ifdef DECODE_DEBUG
00288 kdDebug(6005) << "setEncoding " << _encoding << " " << force << endl;
00289 #endif
00290 enc = _encoding;
00291
00292 QTextCodec *old = m_codec;
00293 #ifdef DECODE_DEBUG
00294 kdDebug(6005) << "old encoding is:" << m_codec->name() << endl;
00295 #endif
00296 enc = enc.lower();
00297 #ifdef DECODE_DEBUG
00298 kdDebug(6005) << "requesting:" << enc << endl;
00299 #endif
00300 if(enc.isNull() || enc.isEmpty())
00301 return;
00302 if(enc == "visual")
00303 enc = "iso8859-8";
00304 bool b;
00305 m_codec = KGlobal::charsets()->codecForName(enc, b);
00306 if(m_codec->mibEnum() == 11) {
00307
00308 m_codec = QTextCodec::codecForName("iso8859-8-i");
00309 visualRTL = true;
00310 }
00311 if( !b )
00312 m_codec = old;
00313 else
00314 haveEncoding = force;
00315 delete m_decoder;
00316 m_decoder = m_codec->makeDecoder();
00317 if (m_codec->mibEnum() == 1000)
00318 haveEncoding = false;
00319 #ifdef DECODE_DEBUG
00320 kdDebug(6005) << "Decoder::encoding used is" << m_codec->name() << endl;
00321 #endif
00322 }
00323
00324 const char *Decoder::encoding() const
00325 {
00326 return enc;
00327 }
00328
00329 QString Decoder::decode(const char *data, int len)
00330 {
00331
00332
00333
00334 if(!haveEncoding && !body) {
00335 #ifdef DECODE_DEBUG
00336 kdDebug(6005) << "looking for charset definition" << endl;
00337 #endif
00338
00339 uchar * uchars = (uchar *) data;
00340 if( uchars[0] == 0xfe && uchars[1] == 0xff ||
00341 uchars[0] == 0xff && uchars[1] == 0xfe ) {
00342 enc = "ISO-10646-UCS-2";
00343 haveEncoding = true;
00344 m_codec = QTextCodec::codecForMib(1000);
00345 delete m_decoder;
00346 m_decoder = m_codec->makeDecoder();
00347 } else {
00348
00349 if(m_codec->mibEnum() != 1000) {
00350
00351 char *d = const_cast<char *>(data);
00352 int i = len - 1;
00353 while(i >= 0) {
00354 if(d[i] == 0) d[i] = ' ';
00355 i--;
00356 }
00357 }
00358 buffer += QCString(data, len+1);
00359
00360
00361
00362
00363
00364 const char *ptr = buffer.data();
00365 while(*ptr != '\0')
00366 {
00367 if(*ptr == '<') {
00368 bool end = false;
00369 ptr++;
00370 if(*ptr == '/') ptr++, end=true;
00371 char tmp[20];
00372 int len = 0;
00373 while (
00374 ((*ptr >= 'a') && (*ptr <= 'z') ||
00375 (*ptr >= 'A') && (*ptr <= 'Z') ||
00376 (*ptr >= '0') && (*ptr <= '9'))
00377 && len < 19 )
00378 {
00379 tmp[len] = tolower( *ptr );
00380 ptr++;
00381 len++;
00382 }
00383 tmp[len] = 0;
00384 int id = khtml::getTagID(tmp, len);
00385 if(end) id += ID_CLOSE_TAG;
00386
00387 switch( id ) {
00388 case ID_META:
00389 {
00390
00391
00392 const char * end = ptr;
00393 while(*end != '>' && *end != '\0') end++;
00394 if ( *end == '\0' ) break;
00395 QCString str( ptr, (end-ptr)+1);
00396 str = str.lower();
00397 int pos = 0;
00398
00399
00400 while( pos < ( int ) str.length() ) {
00401 if( (pos = str.find("charset", pos)) == -1) break;
00402 pos += 7;
00403
00404 while( pos < (int)str.length() && str[pos] <= ' ' ) pos++;
00405 if ( pos == ( int )str.length()) break;
00406 if ( str[pos++] != '=' ) continue;
00407 while ( pos < ( int )str.length() &&
00408 ( str[pos] <= ' ' ) || str[pos] == '=' || str[pos] == '"' || str[pos] == '\'')
00409 pos++;
00410
00411
00412 if ( pos == ( int )str.length() ) break;
00413 uint endpos = pos;
00414 while( endpos < str.length() &&
00415 (str[endpos] != ' ' && str[endpos] != '"' && str[endpos] != '\''
00416 && str[endpos] != ';' && str[endpos] != '>') )
00417 endpos++;
00418 enc = str.mid(pos, endpos-pos);
00419 #ifdef DECODE_DEBUG
00420 kdDebug( 6005 ) << "Decoder: found charset: " << enc.data() << endl;
00421 #endif
00422 setEncoding(enc, true);
00423 if( haveEncoding ) goto found;
00424
00425 if ( endpos >= str.length() || str[endpos] == '/' || str[endpos] == '>' ) break;
00426
00427 pos = endpos + 1;
00428 }
00429 }
00430 case ID_SCRIPT:
00431 case (ID_SCRIPT+ID_CLOSE_TAG):
00432 case ID_NOSCRIPT:
00433 case (ID_NOSCRIPT+ID_CLOSE_TAG):
00434 case ID_STYLE:
00435 case (ID_STYLE+ID_CLOSE_TAG):
00436 case ID_LINK:
00437 case (ID_LINK+ID_CLOSE_TAG):
00438 case ID_OBJECT:
00439 case (ID_OBJECT+ID_CLOSE_TAG):
00440 case ID_TITLE:
00441 case (ID_TITLE+ID_CLOSE_TAG):
00442 case ID_BASE:
00443 case (ID_BASE+ID_CLOSE_TAG):
00444 case ID_HTML:
00445 case ID_HEAD:
00446 case 0:
00447 case (0 + ID_CLOSE_TAG ):
00448 break;
00449 default:
00450 body = true;
00451 #ifdef DECODE_DEBUG
00452 kdDebug( 6005 ) << "Decoder: no charset found. Id=" << id << endl;
00453 #endif
00454 goto found;
00455 }
00456 }
00457 else
00458 ptr++;
00459 }
00460 return QString::null;
00461 }
00462 }
00463
00464 found:
00465 if (!haveEncoding && KGlobal::locale()->languageList()[0] == "ja") {
00466 #ifdef DECODE_DEBUG
00467 kdDebug( 6005 ) << "Decoder: use auto-detect (" << strlen(data) << ")" << endl;
00468 #endif
00469 switch ( KanjiCode::judge( data ) ) {
00470 case KanjiCode::JIS:
00471 enc = "jis7";
00472 break;
00473 case KanjiCode::EUC:
00474 enc = "eucjp";
00475 break;
00476 case KanjiCode::SJIS:
00477 enc = "sjis";
00478 break;
00479 default:
00480 enc = NULL;
00481 break;
00482 }
00483 #ifdef DECODE_DEBUG
00484 kdDebug( 6005 ) << "Decoder: auto detect encoding is " << enc << endl;
00485 #endif
00486 if (!enc.isEmpty()) {
00487 setEncoding(enc, true);
00488 }
00489 }
00490
00491
00492
00493 if (!m_codec)
00494 {
00495 if(enc.isEmpty()) enc = "iso8859-1";
00496 m_codec = QTextCodec::codecForName(enc);
00497
00498 if(!m_codec) {
00499 m_codec = QTextCodec::codecForMib(4);
00500 enc = "iso8859-1";
00501 }
00502 delete m_decoder;
00503 m_decoder = m_codec->makeDecoder();
00504 }
00505 QString out;
00506
00507 if(!buffer.isEmpty() && enc != "ISO-10646-UCS-2") {
00508 out = m_decoder->toUnicode(buffer, buffer.length());
00509 buffer = "";
00510 } else {
00511 if(m_codec->mibEnum() != 1000)
00512 {
00513
00514
00515 char *d = const_cast<char *>(data);
00516 int i = len - 1;
00517 while(i >= 0) {
00518 if(*(d+i) == 0) *(d+i) = ' ';
00519 i--;
00520 }
00521 }
00522 out = m_decoder->toUnicode(data, len);
00523 }
00524
00525
00526
00527 if(out[out.length()-1] == QChar::null)
00528 assert(0);
00529 return out;
00530 }
00531
00532 QString Decoder::flush() const
00533 {
00534 return m_decoder->toUnicode(buffer, buffer.length());
00535 }
00536
00537
00538 #undef DECODE_DEBUG