khtml Library API Documentation

decoder.cpp

00001 /*
00002     This file is part of the KDE libraries
00003 
00004     Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de)
00005 
00006     This library is free software; you can redistribute it and/or
00007     modify it under the terms of the GNU Library General Public
00008     License as published by the Free Software Foundation; either
00009     version 2 of the License, or (at your option) any later version.
00010 
00011     This library is distributed in the hope that it will be useful,
00012     but WITHOUT ANY WARRANTY; without even the implied warranty of
00013     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014     Library General Public License for more details.
00015 
00016     You should have received a copy of the GNU Library General Public License
00017     along with this library; see the file COPYING.LIB.  If not, write to
00018     the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
00019     Boston, MA 02111-1307, USA.
00020 */
00021 //----------------------------------------------------------------------------
00022 //
00023 // KDE HTML Widget -- decoder for input stream
00024 // $Id: decoder.cpp,v 1.59 2002/11/22 03:05:38 mueller Exp $
00025 
00026 #undef DECODE_DEBUG
00027 //#define DECODE_DEBUG
00028 
00029 #include <assert.h>
00030 
00031 #include "decoder.h"
00032 using namespace khtml;
00033 
00034 #include "htmlhashes.h"
00035 
00036 #include <qregexp.h>
00037 #include <qtextcodec.h>
00038 
00039 #include <kglobal.h>
00040 #include <kcharsets.h>
00041 
00042 #include <ctype.h>
00043 #include <kdebug.h>
00044 #include <klocale.h>
00045 
/**
 * Heuristic detector for the Japanese encodings JIS (ISO-2022-JP style
 * escape sequences), EUC-JP and Shift_JIS.
 *
 * All members are static; the class only serves as a namespace.
 */
class KanjiCode
{
public:
    enum Type {ASCII, JIS, EUC, SJIS, UNICODE, UTF8 };

    /**
     * Guesses the encoding of the NUL-terminated byte string @p str.
     *
     * @param str bytes to inspect; a null pointer is treated like an
     *            empty string.
     * @return JIS, EUC or SJIS when the input points to one of them,
     *         ASCII when nothing conclusive was found.
     */
    static enum Type judge(const char *str);

    static const int ESC;
    static const int _SS2_;

    /** Per-byte flags: bit 0 (value 1) = kanji byte, bit 1 (value 2) = kana byte. */
    static const unsigned char kanji_map_sjis[];

    /** Returns non-zero when @p code is flagged as a kanji byte in kanji_map_sjis. */
    static int ISkanji(int code)
    {
        return (code >= 0x100) ? 0 : (kanji_map_sjis[code & 0xff] & 1);
    }

    /** Returns non-zero when @p code is flagged as a kana byte in kanji_map_sjis. */
    static int ISkana(int code)
    {
        return (code >= 0x100) ? 0 : (kanji_map_sjis[code & 0xff] & 2);
    }

};

const int KanjiCode::ESC = 0x1b;
const int KanjiCode::_SS2_ = 0x8e;

const unsigned char KanjiCode::kanji_map_sjis[] =
{
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x80 */ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xa0 */ 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xb0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xc0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xd0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xf0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0
};

/*
 * EUC-JP is
 *     [0xa1 - 0xfe][0xa1 - 0xfe]
 *     0x8e[0xa1 - 0xfe](SS2)
 *     0x8f[0xa1 - 0xfe][0xa1 - 0xfe](SS3)
 *
 * Shift_Jis is
 *     [0x81 - 0x9f, 0xe0 - 0xef(0xfe?)][0x40 - 0x7e, 0x80 - 0xfc]
 *
 * Shift_Jis Hankaku Kana is
 *     [0xa1 - 0xdf]
 */

/*
 * KanjiCode::judge() is based on judge_jcode() from jvim
 *     http://hp.vector.co.jp/authors/VA003457/vim/
 *
 * Special Thanks to Kenichi Tsuchida
 */

/*
 * Maybe we should use QTextCodec::heuristicContentMatch(),
 * but it fails to detect these encodings, so it is not useful here.
 */

enum KanjiCode::Type KanjiCode::judge(const char *str)
{
    // strlen() on a null pointer would crash; there is nothing to detect.
    if (!str)
        return ASCII;

    const unsigned char *ptr = (const unsigned char *) str;
    const int size = (int) strlen(str);

    enum Type code = ASCII;
    bool bfr = false;           /* Kana Moji: previous byte was an ambiguous high byte */
    int bfk = 0;                /* EUC Kana: consecutive 0x8e + kana pairs seen */
    int sjis = 0;               /* evidence score for Shift_JIS */
    int euc = 0;                /* evidence score for EUC-JP */

    int i = 0;
    while (i < size) {
        const unsigned char cur = ptr[i];
        // i < size, so ptr[i + 1] is at worst the terminating NUL, which
        // matches none of the byte ranges tested below.
        const unsigned char next = ptr[i + 1];

        /* --- escape sequences -------------------------------------- */
        if (cur == ESC && (size - i >= 3)) {
            const unsigned char fin = ptr[i + 2];
            if ((next == '$' && (fin == 'B' || fin == '@'))
                    || (next == '(' && (fin == 'B' || fin == 'J'))) {
                return JIS;
            }
            if ((next == '(' || next == ')') && fin == 'I') {
                code = JIS;
                i += 3;
            } else {
                i++;
            }
            bfr = false;
            bfk = 0;
            continue;
        }

        /* --- control characters: look back at the two bytes before -- */
        if (cur < 0x20) {
            bfr = false;
            bfk = 0;
            if (i >= 2) {
                const unsigned char lead = ptr[i - 2];
                const unsigned char trail = ptr[i - 1];
                if (lead == 0x81 && 0x41 <= trail && trail <= 0x49) {
                    code = SJIS;
                    sjis += 100;        /* kudokuten */
                } else if (lead == 0xa1 && 0xa2 <= trail && trail <= 0xaa) {
                    code = EUC;
                    euc += 100;         /* kudokuten */
                } else if (lead == 0x82 && 0xa0 <= trail) {
                    sjis += 40;         /* hiragana */
                } else if (lead == 0xa4 && 0xa0 <= trail) {
                    euc += 40;          /* hiragana */
                }
            }
            i++;
            continue;
        }

        /* --- score hiragana / katakana lead bytes ------------------- */
        if (cur == 0x82 && 0xa0 <= next) {
            sjis++;     /* hiragana */
        } else if (cur == 0x83 && 0x40 <= next && next <= 0x9f) {
            sjis++;     /* katakana */
        } else if (cur == 0xa4 && 0xa0 <= next) {
            euc++;      /* hiragana */
        } else if (cur == 0xa5 && 0xa0 <= next) {
            euc++;      /* katakana */
        }

        if (bfr) {
            // bfr is only ever set after a byte has been consumed, so a
            // previous byte exists; the fallback 0 matches no test below.
            const unsigned char prev = (i >= 1) ? ptr[i - 1] : 0;

            if ((0x40 <= cur && cur <= 0xa0) && ISkanji(prev))
                return SJIS;
            if ((0x81 <= prev && prev <= 0x9f)
                    && ((0x40 <= cur && cur < 0x7e) || (0x7e < cur && cur <= 0xfc)))
                return SJIS;
            if ((0xfd <= cur && cur <= 0xfe) && (0xa1 <= prev && prev <= 0xfe))
                return EUC;
            if ((0xfd <= prev && prev <= 0xfe) && (0xa1 <= cur && cur <= 0xfe))
                return EUC;
            if ((cur < 0xa0 || 0xdf < cur) && prev == 0x8e)
                return SJIS;
            if (cur <= 0x7f)
                return SJIS;

            // Only bytes above 0x7f get here.
            if (0xa1 <= cur && cur <= 0xa6) {
                euc++;      /* sjis hankaku kana kigo */
            } else if (0xa1 <= cur && cur <= 0xdf) {
                ;           /* sjis hankaku kana */
            } else if (0xa1 <= cur && cur <= 0xfe) {
                euc++;
            } else if (cur == 0x8e) {
                euc++;
            }
            bfr = false;
            bfk = 0;
        } else if (cur == 0x8e) {
            if (size - i <= 1) {
                ;   /* last byte of the input: nothing to pair it with */
            } else if (0xa1 <= next && next <= 0xdf) {
                /* EUC KANA or SJIS KANJI */
                if (bfk == 1) {
                    euc += 100;
                }
                bfk++;
                i++;    /* skip the kana byte as well */
            } else {
                /* SJIS only */
                return SJIS;
            }
        } else if (0x81 <= cur && cur <= 0x9f) {
            /* SJIS only */
            code = SJIS;
            if ((0x40 <= next && next <= 0x7e) || (0x80 <= next && next <= 0xfc))
                return SJIS;
        } else if (0xfd <= cur && cur <= 0xfe) {
            /* EUC only */
            code = EUC;
            if (0xa1 <= next && next <= 0xfe)
                return EUC;
        } else if (cur <= 0x7f) {
            ;   /* plain ASCII carries no information */
        } else {
            bfr = true;
            bfk = 0;
        }
        i++;
    }

    // No decisive byte sequence was seen: fall back to the collected scores.
    if (code == ASCII) {
        if (sjis > euc) {
            code = SJIS;
        } else if (sjis < euc) {
            code = EUC;
        }
    }
    return code;
}
00268 
00269 Decoder::Decoder()
00270 {
00271     // latin1
00272     m_codec = QTextCodec::codecForMib(4);
00273     m_decoder = m_codec->makeDecoder();
00274     enc = 0;
00275     body = false;
00276     beginning = true;
00277     visualRTL = false;
00278     haveEncoding = false;
00279 }
00280 Decoder::~Decoder()
00281 {
00282     delete m_decoder;
00283 }
00284 
00285 void Decoder::setEncoding(const char *_encoding, bool force)
00286 {
00287 #ifdef DECODE_DEBUG
00288     kdDebug(6005) << "setEncoding " << _encoding << " " << force << endl;
00289 #endif
00290     enc = _encoding;
00291 
00292     QTextCodec *old = m_codec;
00293 #ifdef DECODE_DEBUG
00294     kdDebug(6005) << "old encoding is:" << m_codec->name() << endl;
00295 #endif
00296     enc = enc.lower();
00297 #ifdef DECODE_DEBUG
00298     kdDebug(6005) << "requesting:" << enc << endl;
00299 #endif
00300     if(enc.isNull() || enc.isEmpty())
00301         return;
00302     if(enc == "visual") // hebrew visually ordered
00303         enc = "iso8859-8";
00304     bool b;
00305     m_codec = KGlobal::charsets()->codecForName(enc, b);
00306     if(m_codec->mibEnum() == 11)  {
00307         // iso8859-8 (visually ordered)
00308         m_codec = QTextCodec::codecForName("iso8859-8-i");
00309         visualRTL = true;
00310     }
00311     if( !b ) // in case the codec didn't exist, we keep the old one (fixes some sites specifying invalid codecs)
00312         m_codec = old;
00313     else
00314         haveEncoding = force;
00315     delete m_decoder;
00316     m_decoder = m_codec->makeDecoder();
00317     if (m_codec->mibEnum() == 1000) // utf 16
00318         haveEncoding = false; // force auto detection
00319 #ifdef DECODE_DEBUG
00320     kdDebug(6005) << "Decoder::encoding used is" << m_codec->name() << endl;
00321 #endif
00322 }
00323 
00324 const char *Decoder::encoding() const
00325 {
00326     return enc;
00327 }
00328 
00329 QString Decoder::decode(const char *data, int len)
00330 {
00331     // this is not completely efficient, since the function might go
00332     // through the html head several times...
00333 
00334     if(!haveEncoding && !body) {
00335 #ifdef DECODE_DEBUG
00336         kdDebug(6005) << "looking for charset definition" << endl;
00337 #endif
00338         // check for UTF-16
00339         uchar * uchars = (uchar *) data;
00340         if( uchars[0] == 0xfe && uchars[1] == 0xff ||
00341             uchars[0] == 0xff && uchars[1] == 0xfe ) {
00342             enc = "ISO-10646-UCS-2";
00343             haveEncoding = true;
00344             m_codec = QTextCodec::codecForMib(1000);
00345             delete m_decoder;
00346             m_decoder = m_codec->makeDecoder();
00347         } else {
00348 
00349             if(m_codec->mibEnum() != 1000) {  // utf16
00350                 // replace '\0' by spaces, for buggy pages
00351                 char *d = const_cast<char *>(data);
00352                 int i = len - 1;
00353                 while(i >= 0) {
00354                     if(d[i] == 0) d[i] = ' ';
00355                     i--;
00356                 }
00357             }
00358             buffer += QCString(data, len+1);
00359 
00360             // we still don't have an encoding, and are in the head
00361             // the following tags are allowed in <head>:
00362             // SCRIPT|STYLE|META|LINK|OBJECT|TITLE|BASE
00363 
00364             const char *ptr = buffer.data();
00365             while(*ptr != '\0')
00366             {
00367                 if(*ptr == '<') {
00368                     bool end = false;
00369                     ptr++;
00370                     if(*ptr == '/') ptr++, end=true;
00371                     char tmp[20];
00372                     int len = 0;
00373                     while (
00374                         ((*ptr >= 'a') && (*ptr <= 'z') ||
00375                          (*ptr >= 'A') && (*ptr <= 'Z') ||
00376                          (*ptr >= '0') && (*ptr <= '9'))
00377                         && len < 19 )
00378                     {
00379                         tmp[len] = tolower( *ptr );
00380                         ptr++;
00381                         len++;
00382                     }
00383                     tmp[len] = 0;
00384                     int id = khtml::getTagID(tmp, len);
00385                     if(end) id += ID_CLOSE_TAG;
00386 
00387                     switch( id ) {
00388                     case ID_META:
00389                     {
00390                         // found a meta tag...
00391                         //ptr += 5;
00392                         const char * end = ptr;
00393                         while(*end != '>' && *end != '\0') end++;
00394                         if ( *end == '\0' ) break;
00395                         QCString str( ptr, (end-ptr)+1);
00396                         str = str.lower();
00397                         int pos = 0;
00398                         //if( (pos = str.find("http-equiv", pos)) == -1) break;
00399                         //if( (pos = str.find("content-type", pos)) == -1) break;
00400                         while( pos < ( int ) str.length() ) {
00401                             if( (pos = str.find("charset", pos)) == -1) break;
00402                             pos += 7;
00403                             // skip whitespace..
00404                             while(  pos < (int)str.length() && str[pos] <= ' ' ) pos++;
00405                             if ( pos == ( int )str.length()) break;
00406                             if ( str[pos++] != '=' ) continue;
00407                             while ( pos < ( int )str.length() &&
00408                                     ( str[pos] <= ' ' ) || str[pos] == '=' || str[pos] == '"' || str[pos] == '\'')
00409                                 pos++;
00410 
00411                             // end ?
00412                             if ( pos == ( int )str.length() ) break;
00413                             uint endpos = pos;
00414                             while( endpos < str.length() &&
00415                                    (str[endpos] != ' ' && str[endpos] != '"' && str[endpos] != '\''
00416                                     && str[endpos] != ';' && str[endpos] != '>') )
00417                                 endpos++;
00418                             enc = str.mid(pos, endpos-pos);
00419 #ifdef DECODE_DEBUG
00420                             kdDebug( 6005 ) << "Decoder: found charset: " << enc.data() << endl;
00421 #endif
00422                             setEncoding(enc, true);
00423                             if( haveEncoding ) goto found;
00424 
00425                             if ( endpos >= str.length() || str[endpos] == '/' || str[endpos] == '>' ) break;
00426 
00427                             pos = endpos + 1;
00428                         }
00429                     }
00430                     case ID_SCRIPT:
00431                     case (ID_SCRIPT+ID_CLOSE_TAG):
00432                     case ID_NOSCRIPT:
00433                     case (ID_NOSCRIPT+ID_CLOSE_TAG):
00434                     case ID_STYLE:
00435                     case (ID_STYLE+ID_CLOSE_TAG):
00436                     case ID_LINK:
00437                     case (ID_LINK+ID_CLOSE_TAG):
00438                     case ID_OBJECT:
00439                     case (ID_OBJECT+ID_CLOSE_TAG):
00440                     case ID_TITLE:
00441                     case (ID_TITLE+ID_CLOSE_TAG):
00442                     case ID_BASE:
00443                     case (ID_BASE+ID_CLOSE_TAG):
00444                     case ID_HTML:
00445                     case ID_HEAD:
00446                     case 0:
00447                     case (0 + ID_CLOSE_TAG ):
00448                         break;
00449                     default:
00450                         body = true;
00451 #ifdef DECODE_DEBUG
00452                         kdDebug( 6005 ) << "Decoder: no charset found. Id=" << id << endl;
00453 #endif
00454                         goto found;
00455                     }
00456                 }
00457                 else
00458                     ptr++;
00459             }
00460             return QString::null;
00461         }
00462     }
00463 
00464  found:
00465     if (!haveEncoding && KGlobal::locale()->languageList()[0] == "ja") {
00466 #ifdef DECODE_DEBUG
00467         kdDebug( 6005 ) << "Decoder: use auto-detect (" << strlen(data) << ")" << endl;
00468 #endif
00469         switch ( KanjiCode::judge( data ) ) {
00470         case KanjiCode::JIS:
00471             enc = "jis7";
00472             break;
00473         case KanjiCode::EUC:
00474             enc = "eucjp";
00475             break;
00476         case KanjiCode::SJIS:
00477             enc = "sjis";
00478             break;
00479         default:
00480             enc = NULL;
00481             break;
00482         }
00483 #ifdef DECODE_DEBUG
00484         kdDebug( 6005 ) << "Decoder: auto detect encoding is " << enc << endl;
00485 #endif
00486         if (!enc.isEmpty()) {
00487             setEncoding(enc, true);
00488         }
00489     }
00490 
00491     // if we still haven't found an encoding latin1 will be used...
00492     // this is according to HTML4.0 specs
00493     if (!m_codec)
00494     {
00495         if(enc.isEmpty()) enc = "iso8859-1";
00496         m_codec = QTextCodec::codecForName(enc);
00497         // be sure not to crash
00498         if(!m_codec) {
00499             m_codec = QTextCodec::codecForMib(4);
00500             enc = "iso8859-1";
00501         }
00502         delete m_decoder;
00503         m_decoder = m_codec->makeDecoder();
00504     }
00505     QString out;
00506 
00507     if(!buffer.isEmpty() && enc != "ISO-10646-UCS-2") {
00508         out = m_decoder->toUnicode(buffer, buffer.length());
00509         buffer = "";
00510     } else {
00511         if(m_codec->mibEnum() != 1000) // utf16
00512         {
00513             // ### hack for a bug in QTextCodec. It cut's the input stream
00514             // in case there are \0 in it. ZDNET has them inside... :-(
00515             char *d = const_cast<char *>(data);
00516             int i = len - 1;
00517             while(i >= 0) {
00518                 if(*(d+i) == 0) *(d+i) = ' ';
00519                 i--;
00520             }
00521         }
00522         out = m_decoder->toUnicode(data, len);
00523     }
00524 
00525     // the hell knows, why the output does sometimes have a QChar::null at
00526     // the end...
00527     if(out[out.length()-1] == QChar::null)
00528         assert(0);
00529     return out;
00530 }
00531 
00532 QString Decoder::flush() const
00533 {
00534     return m_decoder->toUnicode(buffer, buffer.length());
00535 }
00536 
00537 // -----------------------------------------------------------------------------
00538 #undef DECODE_DEBUG
KDE Logo
This file is part of the documentation for kdelibs Version 3.1.5.
Documentation copyright © 1996-2002 the KDE developers.
Generated on Wed Jan 28 13:33:35 2004 by doxygen 1.3.4 written by Dimitri van Heesch, © 1997-2001