khtml Library API Documentation

htmltokenizer.cpp

00001 /*
00002     This file is part of the KDE libraries
00003 
00004     Copyright (C) 1997 Martin Jones (mjones@kde.org)
00005               (C) 1997 Torben Weis (weis@kde.org)
00006               (C) 1998 Waldo Bastian (bastian@kde.org)
00007               (C) 1999 Lars Knoll (knoll@kde.org)
00008               (C) 1999 Antti Koivisto (koivisto@kde.org)
00009               (C) 2001-2003 Dirk Mueller (mueller@kde.org)
00010               (C) 2002 Apple Computer, Inc.
00011 
00012     This library is free software; you can redistribute it and/or
00013     modify it under the terms of the GNU Library General Public
00014     License as published by the Free Software Foundation; either
00015     version 2 of the License, or (at your option) any later version.
00016 
00017     This library is distributed in the hope that it will be useful,
00018     but WITHOUT ANY WARRANTY; without even the implied warranty of
00019     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00020     Library General Public License for more details.
00021 
00022     You should have received a copy of the GNU Library General Public License
00023     along with this library; see the file COPYING.LIB.  If not, write to
00024     the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
00025     Boston, MA 02111-1307, USA.
00026 */
00027 //----------------------------------------------------------------------------
00028 //
00029 // KDE HTML Widget - Tokenizers
00030 // $Id: htmltokenizer.cpp,v 1.248.2.7 2003/08/29 11:41:51 mueller Exp $
00031 
00032 //#define TOKEN_DEBUG 1
00033 //#define TOKEN_DEBUG 2
00034 
00035 #ifdef HAVE_CONFIG_H
00036 #include "config.h"
00037 #endif
00038 
00039 //#include <string.h>
00040 #include "html/htmltokenizer.h"
00041 #include "html/html_documentimpl.h"
00042 #include "html/htmlparser.h"
00043 #include "html/dtd.h"
00044 
00045 #include "misc/loader.h"
00046 #include "misc/htmlhashes.h"
00047 
00048 #include "khtmlview.h"
00049 #include "khtml_part.h"
00050 #include "xml/dom_docimpl.h"
00051 #include "css/csshelper.h"
00052 #include "ecma/kjs_proxy.h"
00053 #include <kcharsets.h>
00054 #include <kglobal.h>
00055 #include <ctype.h>
00056 #include <assert.h>
00057 #include <qvariant.h>
00058 #include <kdebug.h>
00059 #include <stdlib.h>
00060 
00061 #include "kentities.c"
00062 
00063 using namespace khtml;
00064 
00065 static const QChar commentStart [] = { '<','!','-','-', QChar::null };
00066 
00067 static const char scriptEnd [] = "</script";
00068 static const char xmpEnd [] = "</xmp";
00069 static const char styleEnd [] =  "</style";
00070 static const char textareaEnd [] = "</textarea";
00071 static const char titleEnd [] = "</title";
00072 // malloc-family helpers for raw QChar vectors; release them with KHTML_DELETE_QCHAR_VEC.
00073 #define KHTML_ALLOC_QCHAR_VEC( N ) (QChar*) malloc( sizeof(QChar)*( N ) )
00074 #define KHTML_REALLOC_QCHAR_VEC(P, N ) (P) = (QChar*) realloc( (P), sizeof(QChar)*( N ) ) // P must be an lvalue; it is resized in place to N QChars
00075 #define KHTML_DELETE_QCHAR_VEC( P ) free((char*)( P ))
00076 
00077 // Full support for MS Windows extensions to Latin-1.
00078 // Technically these extensions should only be activated for pages
00079 // marked "windows-1252" or "cp1252", but
00080 // in the standard Microsoft way, these extensions infect hundreds of thousands
00081 // of web pages.  Note that people with non-latin-1 Microsoft extensions
00082 // are SOL.
00083 //
00084 // See: http://www.microsoft.com/globaldev/reference/WinCP.asp
00085 //      http://www.bbsinc.com/iso8859.html
00086 //      http://www.obviously.com/
00087 //
00088 // There may be better equivalents
00089 #if 0 // with 1 here fixUpChar() expands to nothing
00090 #define fixUpChar(x)
00091 #else
00092 #define fixUpChar(x) \
00093             if (!(x).row() ) { /* only characters whose row() is 0 are looked at */ \
00094                 switch ((x).cell()) \
00095                 { \
00096                 /* ALL of these should be changed to Unicode SOON */ \
00097                 case 0x80: (x) = 0x20ac; break; \
00098                 case 0x82: (x) = ',';    break; \
00099                 case 0x83: (x) = 0x0192; break; \
00100                 case 0x84: (x) = '"';    break; \
00101                 case 0x85: (x) = 0x2026; break; \
00102                 case 0x86: (x) = 0x2020; break; \
00103                 case 0x87: (x) = 0x2021; break; \
00104                 case 0x88: (x) = 0x02C6; break; \
00105                 case 0x89: (x) = 0x2030; break; \
00106                 case 0x8A: (x) = 0x0160; break; \
00107                 case 0x8b: (x) = '<';    break; \
00108                 case 0x8C: (x) = 0x0152; break; \
00109                 /* 0x8D: no case, the value is left as it is */ \
00110                 case 0x8E: (x) = 0x017D; break; \
00111                 /* 0x8F: no case, the value is left as it is */ \
00112                 /* 0x90: no case, the value is left as it is */ \
00113                 case 0x91: (x) = '\'';   break; \
00114                 case 0x92: (x) = '\'';   break; \
00115                 case 0x93: (x) = '"';    break; \
00116                 case 0x94: (x) = '"';    break; \
00117                 case 0x95: (x) = '*';    break; \
00118                 case 0x96: (x) = '-';    break; \
00119                 case 0x97: (x) = '-';    break; \
00120                 case 0x98: (x) = '~';    break; \
00121                 case 0x99: (x) = 0x2122; break; \
00122                 case 0x9A: (x) = 0x0161; break; \
00123                 case 0x9b: (x) = '>';    break; \
00124                 case 0x9C: (x) = 0x0153; break; \
00125                 /* 0x9D: no case, the value is left as it is */ \
00126                 case 0x9E: (x) = 0x017E; break; \
00127                 case 0x9F: (x) = 0x0178; break; \
00128                 default: break; \
00129                 } \
00130             } /* NOTE(review): expands to a bare if-block, so avoid using it as the body of an if that has an else */
00131 #endif
00132 
00133 // ----------------------------------------------------------------------------
00134 
00135 HTMLTokenizer::HTMLTokenizer(DOM::DocumentPtr *_doc, KHTMLView *_view)
00136 {   // View-backed tokenizer: creates a KHTMLParser from (_view, _doc).
00137     // Null the buffers and clear the flags first: reset() frees any non-null buffer and asserts on both flags.
00138     m_executingScript = 0;
00139     onHold = false;
00140     scriptCode = 0;
00141     buffer = 0;
00142     scriptCodeResync = scriptCodeMaxSize = scriptCodeSize = 0;
00143     view = _view;
00144     charsets = KGlobal::charsets();
00145     parser = new KHTMLParser(_view, _doc);
00146     reset();
00147 }
00148 
00149 HTMLTokenizer::HTMLTokenizer(DOM::DocumentPtr *_doc, DOM::DocumentFragmentImpl *i)
00150 {   // Fragment tokenizer: there is no view; creates a KHTMLParser from (i, _doc).
00151     // Null the buffers and clear the flags first: reset() frees any non-null buffer and asserts on both flags.
00152     m_executingScript = 0;
00153     onHold = false;
00154     scriptCode = 0;
00155     buffer = 0;
00156     scriptCodeResync = scriptCodeMaxSize = scriptCodeSize = 0;
00157     view = 0;
00158     charsets = KGlobal::charsets();
00159     parser = new KHTMLParser( i, _doc );
00160     reset();
00161 }
00162 
00163 void HTMLTokenizer::reset()
00164 {
00165     // Returns the tokenizer to its empty state. Must not be called while
00166     // a script is executing or while the tokenizer is on hold.
00167     assert(m_executingScript == 0);
00168     assert(onHold == false);
00169     // Give up our reference on every script still waiting in the queue.
00170     while ( !cachedScript.isEmpty() ) {
00171         cachedScript.dequeue()->deref(this);
00172     }
00173     // Free the script buffer and the token buffer, and zero their bookkeeping.
00174     if ( scriptCode != 0 ) { KHTML_DELETE_QCHAR_VEC(scriptCode); }
00175     scriptCode = 0;
00176     scriptCodeResync = scriptCodeMaxSize = scriptCodeSize = 0;
00177     if ( buffer != 0 ) { KHTML_DELETE_QCHAR_VEC(buffer); }
00178     dest = 0;
00179     buffer = 0;
00180     size = 0;
00181     currToken.reset();
00182 }
00183 
00184 void HTMLTokenizer::begin()
00185 {
00186     // Prepares the tokenizer for a new run. The two flags are cleared
00187     // before reset() because reset() asserts on them.
00188     onHold = false;
00189     m_executingScript = 0;
00190     reset();
00191 
00192     // Fresh output buffer: 255 QChars are allocated, size records 254.
00193     buffer = KHTML_ALLOC_QCHAR_VEC( 255 );
00194     dest = buffer;
00195     size = 254;
00196 
00197     // All state machines start idle.
00198     tag = NoTag;
00199     Entity = NoEntity;
00200     tquote = NoQuote;
00201     pending = NonePending;
00202     discard = NoneDiscard;
00203 
00204     // Not inside any special content, and no parsing quirk is active.
00205     script = false;
00206     style = false;
00207     xmp = false;
00208     textarea = false;
00209     title = false;
00210     plaintext = false;
00211     comment = server = processingInstruction = false;
00212     pre = select = false;
00213     escaped = skipLF = startTag = noMoreData = false;
00214     brokenComments = brokenServer = brokenScript = false;
00215 
00216     // Counters and line bookkeeping.
00217     prePos = 0;
00218     searchCount = 0;
00219     lineno = scriptStartLineno = tagStartLineno = 0;
00220 }
00221 
00222 void HTMLTokenizer::processListing(DOMStringIt list)
00223 {   // Copies 'list' into the output buffer as preformatted text; 'list' is a by-value copy that is consumed here.
00224     bool old_pre = pre;
00225     // pre is forced on for the duration (unless style is set) and restored from old_pre on exit.
00226     // This function adds the listing 'list' as
00227     // preformatted text-tokens to the token-collection
00228     // thereby converting TABs.
00229     if(!style) pre = true;
00230     prePos = 0;
00231 
00232     while ( list.length() )
00233     {
00234         checkBuffer(3*TAB_SIZE);
00235         // skipLF is set after a CR: an LF that follows is swallowed, anything else just clears the flag.
00236         if (skipLF && ( *list != '\n' ))
00237         {
00238             skipLF = false;
00239         }
00240 
00241         if (skipLF)
00242         {
00243             skipLF = false;
00244             ++list;
00245         }
00246         else if (( *list == '\n' ) || ( *list == '\r' ))
00247         {
00248             if (discard == LFDiscard)
00249             {
00250                 // Ignore this LF
00251                 discard = NoneDiscard; // We have discarded 1 LF
00252             }
00253             else
00254             {
00255                 // Process this LF
00256                 if (pending)
00257                     addPending();
00258                 pending = LFPending;
00259             }
00260             /* Check for MS-DOS CRLF sequence */
00261             if (*list == '\r')
00262             {
00263                 skipLF = true;
00264             }
00265             ++list;
00266         }
00267         else if (( *list == ' ' ) || ( *list == '\t'))
00268         {
00269             if (pending)
00270                 addPending();
00271             if (*list == ' ')
00272                 pending = SpacePending;
00273             else
00274                 pending = TabPending;
00275             // the space or tab itself is not written here; it is only recorded in 'pending'
00276             ++list;
00277         }
00278         else
00279         {
00280             discard = NoneDiscard;
00281             if (pending)
00282                 addPending();
00283             // ordinary character: copy it and advance the column counter
00284             prePos++;
00285             *dest++ = *list;
00286             ++list;
00287         }
00288 
00289     }
00290     // A trailing space or tab is flushed; any other pending state (e.g. a trailing line break) is dropped.
00291     if ((pending == SpacePending) || (pending == TabPending))
00292         addPending();
00293     else
00294         pending = NonePending;
00295 
00296     prePos = 0;
00297     pre = old_pre;
00298 }
00299 
00300 void HTMLTokenizer::parseSpecial(DOMStringIt &src)
00301 {   // Collects the raw content of a script/style/textarea/title/xmp element in scriptCode until its end tag (searchStopper) is complete.
00302     assert( textarea || title || !Entity );
00303     assert( !tag );
00304     assert( xmp+textarea+title+style+script == 1 ); // exactly one of the five modes is active
00305     if (script)
00306         scriptStartLineno = lineno+src.lineCount();
00307     // a comment left unfinished by an earlier call is continued first
00308     if ( comment ) parseComment( src );
00309 
00310     while ( src.length() ) {
00311         checkScriptBuffer();
00312         unsigned char ch = src->latin1();
00313         if ( !scriptCodeResync && !brokenComments && !textarea && !xmp && !title && ch == '-' && scriptCodeSize >= 3 && !src.escaped() && QConstString( scriptCode+scriptCodeSize-3, 3 ).string() == "<!-" ) {
00314             comment = true; // the buffer ends in "<!-" and a '-' follows: a comment starts inside the content
00315             parseComment( src );
00316             continue;
00317         }
00318         if ( scriptCodeResync && !tquote && ( ch == '>' ) ) { // end tag seen earlier and now its unquoted '>' arrives
00319             ++src;
00320             scriptCodeSize = scriptCodeResync-1; // cut the buffer back to just before the end tag
00321             scriptCodeResync = 0;
00322             scriptCode[ scriptCodeSize ] = scriptCode[ scriptCodeSize + 1 ] = 0;
00323             if ( script )
00324                 scriptHandler();
00325             else {
00326                 processListing(DOMStringIt(scriptCode, scriptCodeSize));
00327                 processToken();
00328                 if ( style )         { currToken.id = ID_STYLE + ID_CLOSE_TAG; }
00329                 else if ( textarea ) { currToken.id = ID_TEXTAREA + ID_CLOSE_TAG; }
00330                 else if ( title ) { currToken.id = ID_TITLE + ID_CLOSE_TAG; }
00331                 else if ( xmp )  { currToken.id = ID_XMP + ID_CLOSE_TAG; }
00332                 processToken();
00333                 style = script = style = textarea = title = xmp = false; // NOTE(review): 'style' appears twice in this chain; redundant but harmless
00334                 tquote = NoQuote;
00335                 scriptCodeSize = scriptCodeResync = 0;
00336             }
00337             return;
00338         }
00339         // possible end of tagname, lets check.
00340         if ( !scriptCodeResync && !escaped && !src.escaped() && ( ch == '>' || ch == '/' || ch <= ' ' ) && ch &&
00341              scriptCodeSize >= searchStopperLen &&
00342              !QConstString( scriptCode+scriptCodeSize-searchStopperLen, searchStopperLen ).string().find( searchStopper, 0, false )) { // find() == 0: the buffer tail matches searchStopper (third argument false: presumably case-insensitive, see QString::find)
00343             scriptCodeResync = scriptCodeSize-searchStopperLen+1; // index of the end tag's first character plus one, so 0 keeps meaning "none seen"
00344             tquote = NoQuote;
00345             continue; // the same character is looked at again, now in resync mode
00346         }
00347         if ( scriptCodeResync && !escaped ) { // inside the end tag: track quotes so a quoted '>' does not close it
00348             if(ch == '\"')
00349                 tquote = (tquote == NoQuote) ? DoubleQuote : ((tquote == SingleQuote) ? SingleQuote : NoQuote);
00350             else if(ch == '\'')
00351                 tquote = (tquote == NoQuote) ? SingleQuote : (tquote == DoubleQuote) ? DoubleQuote : NoQuote;
00352             else if (tquote != NoQuote && (ch == '\r' || ch == '\n'))
00353                 tquote = NoQuote; // a line break ends any open quote
00354         }
00355         escaped = ( !escaped && ch == '\\' ); // true only for the character right after an unescaped backslash
00356         if (!scriptCodeResync && (textarea||title) && !src.escaped() && ch == '&') { // only textarea and title content gets entities decoded
00357             QChar *scriptCodeDest = scriptCode+scriptCodeSize;
00358             ++src;
00359             parseEntity(src,scriptCodeDest,true);
00360             scriptCodeSize = scriptCodeDest-scriptCode; // pick up whatever parseEntity appended
00361         }
00362         else {
00363             scriptCode[ scriptCodeSize++ ] = *src;
00364             ++src;
00365         }
00366     }
00367 }
00368 
00369 void HTMLTokenizer::scriptHandler()
00370 {   // Finishes a script element: emits its tokens, then either requests the external script or runs the inline code.
00371     QString currentScriptSrc = scriptSrc;
00372     scriptSrc = QString::null; // taken over by the local copy above
00373     // The collected text goes through processListing(); exScript copies what that left in the output buffer.
00374     processListing(DOMStringIt(scriptCode, scriptCodeSize));
00375     QString exScript( buffer, dest-buffer );
00376 
00377     processToken();
00378     currToken.id = ID_SCRIPT + ID_CLOSE_TAG;
00379     processToken();
00380 
00381     QString prependingSrc;
00382 
00383     if ( !parser->skipMode() ) {
00384         CachedScript* cs = 0;
00385 
00386         // forget what we just got, load from src url instead
00387         if ( !currentScriptSrc.isEmpty() &&
00388              (cs = parser->doc()->docLoader()->requestScript(currentScriptSrc, scriptSrcCharset) ))
00389             cachedScript.enqueue(cs);
00390 
00391         if (cs) { // external script requested: park the unparsed rest of the input in pendingSrc and empty the source
00392             pendingSrc.prepend( QString(src.current(), src.length() ) );
00393             setSrc(QString::null);
00394             scriptCodeSize = scriptCodeResync = 0;
00395             cs->ref(this);
00396 
00397         }
00398         else if (currentScriptSrc.isEmpty() && view && javascript ) { // inline script, and we have a view with javascript set: run it now
00399             if ( !m_executingScript )
00400                 pendingSrc.prepend( QString( src.current(), src.length() ) ); // deep copy - again
00401             else
00402                 prependingSrc = QString( src.current(), src.length() ); // deep copy
00403 
00404             setSrc(QString::null);
00405             scriptCodeSize = scriptCodeResync = 0;
00406             scriptExecution( exScript, QString::null, tagStartLineno /*scriptStartLineno*/ );
00407         }
00408     }
00409 
00410     script = false;
00411     scriptCodeSize = scriptCodeResync = 0;
00412     // Nothing running and nothing queued: what is left of the input followed by pendingSrc becomes the new source.
00413     if ( !m_executingScript && cachedScript.isEmpty() ) {
00414         // kdDebug( 6036 ) << "adding pending Output to parsed string" << endl;
00415         QString newStr = QString(src.current(), src.length());
00416         newStr += pendingSrc;
00417         setSrc(newStr);
00418         pendingSrc = QString::null;
00419     }
00420     else if ( !prependingSrc.isEmpty() ) // otherwise the rest saved during a nested run is handed to write()
00421         write( prependingSrc, false );
00422 }
00423 
00424 void HTMLTokenizer::scriptExecution( const QString& str, QString scriptURL,
00425                                      int baseLine)
00426 {
00427     // Runs str through the view's part. While it runs the nesting counter
00428     // is raised and the script flag is cleared; both are restored afterwards.
00429     const bool savedScript = script;
00430     script = false;
00431     ++m_executingScript;
00432     // A null scriptURL means: use the URL of the part's document instead.
00433     QString url = scriptURL;
00434     if ( url.isNull() )
00435         url = static_cast<DocumentImpl*>(view->part()->document().handle())->URL();
00436     view->part()->executeScript(url,baseLine,Node(),str);
00437     --m_executingScript;
00438     script = savedScript;
00439 }
00440 
00441 void HTMLTokenizer::parseComment(DOMStringIt &src)
00442 {   // Copies comment text into scriptCode until the comment's closing '>' is found; returns early if src runs out first.
00443     checkScriptBuffer(src.length());
00444     while ( src.length() ) {
00445         scriptCode[ scriptCodeSize++ ] = *src;
00446 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
00447         qDebug("comment is now: *%s*",
00448                QConstString((QChar*)src.current(), QMIN(16, src.length())).string().latin1());
00449 #endif
00450         if (src->unicode() == '>' && // a '>' ends the comment if brokenComments is set (outside script/style) or if "--" precedes it
00451             ( ( brokenComments && !( script || style ) ) ||
00452               ( scriptCodeSize > 2 && scriptCode[scriptCodeSize-3] == '-' &&
00453                 scriptCode[scriptCodeSize-2] == '-' ) ) ) {
00454             ++src; // consume the '>'
00455             if ( !( script || xmp || textarea || style) ) { // otherwise the comment text stays in scriptCode as part of the content
00456 #ifdef COMMENTS_IN_DOM
00457                 checkScriptBuffer();
00458                 scriptCode[ scriptCodeSize ] = 0;
00459                 scriptCode[ scriptCodeSize + 1 ] = 0;
00460                 currToken.id = ID_COMMENT;
00461                 processListing(DOMStringIt(scriptCode, scriptCodeSize - 2));
00462                 processToken();
00463                 currToken.id = ID_COMMENT + ID_CLOSE_TAG;
00464                 processToken();
00465 #endif
00466                 scriptCodeSize = 0; // the comment text is discarded
00467             }
00468             comment = false;
00469             return; // Finished parsing comment
00470         }
00471         ++src;
00472     }
00473 }
00474 
00475 void HTMLTokenizer::parseServer(DOMStringIt &src)
00476 {
00477     // Swallows a server include up to and including its closing "%>".
00478     checkScriptBuffer(src.length());
00479     for ( ; src.length(); ++src ) {
00480         scriptCode[ scriptCodeSize++ ] = *src;
00481         const bool closes = src->unicode() == '>' && scriptCodeSize > 1 && scriptCode[scriptCodeSize-2] == '%';
00482         if ( !closes )
00483             continue;
00484         ++src;                // step over the '>'
00485         scriptCodeSize = 0;   // the copied text is thrown away
00486         server = false;
00487         return;
00488     }
00489 }
00490 
00491 void HTMLTokenizer::parseProcessingInstruction(DOMStringIt &src)
00492 {
00493     // Skips over a processing instruction; nothing is copied, only the
00494     // quote state and the previous character are tracked.
00495     unsigned char prev = 0;
00496     for ( ; src.length(); ++src ) {
00497         const unsigned char cur = src->latin1();
00498         switch ( cur ) {
00499         case '\'':
00500             tquote = ( tquote == SingleQuote ) ? NoQuote : SingleQuote;
00501             break;
00502         case '\"':
00503             tquote = ( tquote == DoubleQuote ) ? NoQuote : DoubleQuote;
00504             break;
00505         case '>':
00506             // Ends on "?>" and also on any '>' outside quotes, because
00507             // some sites leave out the '?' (IE compatible).
00508             if ( prev == '?' || !tquote ) {
00509                 ++src;
00510                 processingInstruction = false;
00511                 discard = LFDiscard;
00512                 return; // finished the processing instruction
00513             }
00514             break;
00515         }
00516         prev = cur;
00517     }
00518 }
00518 
00519 void HTMLTokenizer::parseText(DOMStringIt &src)
00520 {
00521     // Copies src into the output buffer one character at a time,
00522     // folding every line ending (CR, LF or CR LF) into a single '\n'.
00523     for ( ; src.length(); ++src )
00524     {
00525         // enlarge the buffer first if that is needed
00526         checkBuffer();
00527 
00528         // latin1() is enough here: only ASCII characters are tested
00529         const unsigned char c = src->latin1();
00530 
00531         if ( skipLF )
00532         {
00533             // The previous character was a CR. Whatever follows, the
00534             // flag is spent; an LF right behind the CR is dropped.
00535             skipLF = false;
00536             if ( c == '\n' )
00537                 continue;
00538         }
00539 
00540         switch ( c ) {
00541         case '\r':
00542             skipLF = true;
00543             // fall through: a CR is written as '\n' as well
00544         case '\n':
00545             *dest++ = '\n';
00546             break;
00547         default:
00548             *dest++ = *src;
00549             break;
00550         }
00551     }
00552 }
00553 
00554 
00555 void HTMLTokenizer::parseEntity(DOMStringIt &src, QChar *&dest, bool start)
00556 {   // Resumable parser for the text after '&'. Note: the 'dest' parameter shadows the member of the same name.
00557     if( start )
00558     {
00559         cBufferPos = 0;
00560         Entity = SearchEntity;
00561     }
00562     // With start == false the state left in Entity, cBuffer, cBufferPos and EntityChar by an earlier call is continued.
00563     while( src.length() )
00564     {
00565         ushort cc = src->unicode();
00566         switch(Entity) {
00567         case NoEntity:
00568             return;
00569 
00570             break;
00571         case SearchEntity:
00572             if(cc == '#') { // '#' introduces a numeric reference
00573                 cBuffer[cBufferPos++] = cc;
00574                 ++src;
00575                 Entity = NumericSearch;
00576             }
00577             else
00578                 Entity = EntityName;
00579 
00580             break;
00581 
00582         case NumericSearch:
00583             if(cc == 'x' || cc == 'X') {
00584                 cBuffer[cBufferPos++] = cc;
00585                 ++src;
00586                 Entity = Hexadecimal;
00587             }
00588             else if(cc >= '0' && cc <= '9')
00589                 Entity = Decimal;
00590             else
00591                 Entity = SearchSemicolon;
00592 
00593             break;
00594 
00595         case Hexadecimal:
00596         {
00597             int uc = EntityChar.unicode(); // continue from the value accumulated so far
00598             int ll = kMin(src.length(), 9-cBufferPos); // cBuffer takes at most 9 characters for one entity
00599             while(ll--) {
00600                 QChar csrc(src->lower());
00601                 cc = csrc.cell();
00602 
00603                 if(csrc.row() || !((cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f'))) {
00604                     Entity = SearchSemicolon;
00605                     break;
00606                 }
00607                 uc = uc*16 + (cc - ( cc < 'a' ? '0' : 'a' - 10));
00608                 cBuffer[cBufferPos++] = cc;
00609                 ++src;
00610             }
00611             EntityChar = QChar(uc);
00612             if(cBufferPos == 9) Entity = SearchSemicolon;
00613             break;
00614         }
00615         case Decimal:
00616         {
00617             int uc = EntityChar.unicode(); // continue from the value accumulated so far
00618             int ll = kMin(src.length(), 9-cBufferPos);
00619             while(ll--) {
00620                 cc = src->cell();
00621 
00622                 if(src->row() || !(cc >= '0' && cc <= '9')) {
00623                     Entity = SearchSemicolon;
00624                     break;
00625                 }
00626 
00627                 uc = uc * 10 + (cc - '0');
00628                 cBuffer[cBufferPos++] = cc;
00629                 ++src;
00630             }
00631             EntityChar = QChar(uc);
00632             if(cBufferPos == 9)  Entity = SearchSemicolon;
00633             break;
00634         }
00635         case EntityName:
00636         {
00637             int ll = kMin(src.length(), 9-cBufferPos);
00638             while(ll--) {
00639                 QChar csrc = *src;
00640                 cc = csrc.cell();
00641 
00642                 if(csrc.row() || !((cc >= 'a' && cc <= 'z') ||
00643                                    (cc >= '0' && cc <= '9') || (cc >= 'A' && cc <= 'Z'))) {
00644                     Entity = SearchSemicolon;
00645                     break;
00646                 }
00647 
00648                 cBuffer[cBufferPos++] = cc;
00649                 ++src;
00650             }
00651             if(cBufferPos == 9) Entity = SearchSemicolon;
00652             if(Entity == SearchSemicolon) {
00653                 if(cBufferPos > 1) {
00654                     const entity *e = findEntity(cBuffer, cBufferPos);
00655                     if(e)
00656                         EntityChar = e->code;
00657 
00658                     // be IE compatible
00659                     if(tag && EntityChar.unicode() > 255 && *src != ';') // inside a tag, a named entity above 255 without ';' is rejected
00660                         EntityChar = QChar::null;
00661                 }
00662             }
00663             else
00664                 break; // name not finished yet: leave the switch and wait for more input
00665         } // no break on purpose: a finished name falls through into SearchSemicolon
00666         case SearchSemicolon:
00667 
00668             //kdDebug( 6036 ) << "ENTITY " << EntityChar.unicode() << ", " << res << endl;
00669 
00670             fixUpChar(EntityChar);
00671 
00672             if ( EntityChar != QChar::null ) {
00673                 checkBuffer(); // NOTE(review): takes no pointer, so presumably checks the member buffer, not the 'dest' parameter - confirm
00674                 // Just insert it
00675                 if (*src == ';')
00676                     ++src;
00677                 // the decoded character is handed back to src rather than written to dest
00678                 src.push( EntityChar );
00679             } else {
00680 #ifdef TOKEN_DEBUG
00681                 kdDebug( 6036 ) << "unknown entity!" << endl;
00682 #endif
00683                 checkBuffer(10);
00684                 // ignore the sequence, add it to the buffer as plaintext
00685                 *dest++ = '&';
00686                 for(unsigned int i = 0; i < cBufferPos; i++)
00687                     dest[i] = cBuffer[i];
00688                 dest += cBufferPos;
00689                 Entity = NoEntity; // NOTE(review): redundant, the same assignment follows below for both branches
00690                 if (pre)
00691                     prePos += cBufferPos+1; // '&' plus the buffered characters
00692             }
00693 
00694             Entity = NoEntity;
00695             EntityChar = QChar::null;
00696             return;
00697         };
00698     }
00699 }
00700 
00701 void HTMLTokenizer::parseTag(DOMStringIt &src)
00702 {
00703     assert(!Entity );
00704 
00705     while ( src.length() )
00706     {
00707         checkBuffer();
00708 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
00709         uint l = 0;
00710         while(l < src.length() && (*(src.current()+l)).latin1() != '>')
00711             l++;
00712         qDebug("src is now: *%s*, tquote: %d",
00713                QConstString((QChar*)src.current(), l).string().latin1(), tquote);
00714 #endif
00715         switch(tag) {
00716         case NoTag:
00717         {
00718             return;
00719         }
00720         case TagName:
00721         {
00722 #if defined(TOKEN_DEBUG) &&  TOKEN_DEBUG > 1
00723             qDebug("TagName");
00724 #endif
00725             if (searchCount > 0)
00726             {
00727                 if (*src == commentStart[searchCount])
00728                 {
00729                     searchCount++;
00730                     if (searchCount == 4)
00731                     {
00732 #ifdef TOKEN_DEBUG
00733                         kdDebug( 6036 ) << "Found comment" << endl;
00734 #endif
00735                         // Found '<!--' sequence
00736                         ++src;
00737                         dest = buffer; // ignore the previous part of this tag
00738                         tag = NoTag;
00739 
00740                         comment = true;
00741                         // push what we parsed so far upon the stack. helps for <!-->
00742                         checkScriptBuffer();
00743                         scriptCode[0] = scriptCode[1] = '-';
00744                         scriptCodeSize = 2;
00745                         parseComment(src);
00746                         return; // Finished parsing tag!
00747                     }
00748                     // cuts of high part, is okay
00749                     cBuffer[cBufferPos++] = src->cell();
00750                     ++src;
00751                     break;
00752                 }
00753                 else
00754                     searchCount = 0; // Stop looking for '<!--' sequence
00755             }
00756 
00757             bool finish = false;
00758             unsigned int ll = kMin(src.length(), CBUFLEN-cBufferPos);
00759             while(ll--) {
00760                 ushort curchar = *src;
00761                 if(curchar <= ' ' || curchar == '>' ) {
00762                     finish = true;
00763                     break;
00764                 }
00765                 // this is a nasty performance trick. will work for the A-Z
00766                 // characters, but not for others. if it contains one,
00767                 // we fail anyway
00768                 char cc = curchar;
00769                 cBuffer[cBufferPos++] = cc | 0x20;
00770                 ++src;
00771             }
00772 
00773             // Disadvantage: we add the possible rest of the tag
00774             // as attribute names. ### judge if this causes problems
00775             if(finish || CBUFLEN == cBufferPos) {
00776                 bool beginTag;
00777                 char* ptr = cBuffer;
00778                 unsigned int len = cBufferPos;
00779                 cBuffer[cBufferPos] = '\0';
00780                 if ((cBufferPos > 0) && (*ptr == '/'))
00781                 {
00782                     // End Tag
00783                     beginTag = false;
00784                     ptr++;
00785                     len--;
00786                 }
00787                 else
00788                     // Start Tag
00789                     beginTag = true;
00790                 // Accept empty xml tags like <br/>
00791                 if(len > 1 && ptr[len-1] == '/' ) {
00792                     ptr[--len] = '\0';
00793                     // if its like <br/> and not like <input/ value=foo>, take it as flat
00794                     if (*src == '>')
00795                         currToken.flat = true;
00796                 }
00797 
00798                 uint tagID = khtml::getTagID(ptr, len);
00799                 if (!tagID) {
00800 #ifdef TOKEN_DEBUG
00801                     QCString tmp(ptr, len+1);
00802                     kdDebug( 6036 ) << "Unknown tag: \"" << tmp.data() << "\"" << endl;
00803 #endif
00804                     dest = buffer;
00805                 }
00806                 else
00807                 {
00808 #ifdef TOKEN_DEBUG
00809                     QCString tmp(ptr, len+1);
00810                     kdDebug( 6036 ) << "found tag id=" << tagID << ": " << tmp.data() << endl;
00811 #endif
00812                     currToken.id = beginTag ? tagID : tagID + ID_CLOSE_TAG;
00813                     dest = buffer;
00814                 }
00815                 tag = SearchAttribute;
00816                 cBufferPos = 0;
00817             }
00818             break;
00819         }
00820         case SearchAttribute:
00821         {
00822 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
00823                 qDebug("SearchAttribute");
00824 #endif
00825             bool atespace = false;
00826             ushort curchar;
00827             while(src.length()) {
00828                 curchar = *src;
00829                 if(curchar > ' ') {
00830                     if(curchar == '>')
00831                         tag = SearchEnd;
00832                     else if(atespace && (curchar == '\'' || curchar == '"'))
00833                     {
00834                         tag = SearchValue;
00835                         *dest++ = 0;
00836                         attrName = QString::null;
00837                     }
00838                     else
00839                         tag = AttributeName;
00840 
00841                     cBufferPos = 0;
00842                     break;
00843                 }
00844                 atespace = true;
00845                 ++src;
00846             }
00847             break;
00848         }
00849         case AttributeName:
00850         {
00851 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
00852                 qDebug("AttributeName");
00853 #endif
00854             ushort curchar;
00855             int ll = kMin(src.length(), CBUFLEN-cBufferPos);
00856 
00857             while(ll--) {
00858                 curchar = *src;
00859                 if(curchar <= '>') {
00860                     if(curchar <= ' ' || curchar == '=' || curchar == '>') {
00861                         unsigned int a;
00862                         cBuffer[cBufferPos] = '\0';
00863                         a = khtml::getAttrID(cBuffer, cBufferPos);
00864                         if ( !a )
00865                             attrName = QString::fromLatin1(QCString(cBuffer, cBufferPos+1).data());
00866 
00867                         dest = buffer;
00868                         *dest++ = a;
00869 #ifdef TOKEN_DEBUG
00870                         if (!a || (cBufferPos && *cBuffer == '!'))
00871                             kdDebug( 6036 ) << "Unknown attribute: *" << QCString(cBuffer, cBufferPos+1).data() << "*" << endl;
00872                         else
00873                             kdDebug( 6036 ) << "Known attribute: " << QCString(cBuffer, cBufferPos+1).data() << endl;
00874 #endif
00875                         // did we just get />
00876                         if (!a && cBufferPos == 1 && *cBuffer == '/' && curchar == '>')
00877                             currToken.flat = true;
00878 
00879                         tag = SearchEqual;
00880                         break;
00881                     }
00882                 }
00883                 cBuffer[cBufferPos++] = (char) curchar | 0x20;
00884                 ++src;
00885             }
00886             if ( cBufferPos == CBUFLEN ) {
00887                 cBuffer[cBufferPos] = '\0';
00888                 attrName = QString::fromLatin1(QCString(cBuffer, cBufferPos+1).data());
00889                 dest = buffer;
00890                 *dest++ = 0;
00891                 tag = SearchEqual;
00892             }
00893             break;
00894         }
00895         case SearchEqual:
00896         {
00897 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
00898                 qDebug("SearchEqual");
00899 #endif
00900             ushort curchar;
00901             bool atespace = false;
00902             while(src.length()) {
00903                 curchar = src->unicode();
00904                 if(curchar > ' ') {
00905                     if(curchar == '=') {
00906 #ifdef TOKEN_DEBUG
00907                         kdDebug(6036) << "found equal" << endl;
00908 #endif
00909                         tag = SearchValue;
00910                         ++src;
00911                     }
00912                     else if(atespace && (curchar == '\'' || curchar == '"'))
00913                     {
00914                         tag = SearchValue;
00915                         *dest++ = 0;
00916                         attrName = QString::null;
00917                     }
00918                     else {
00919                         DOMString v("");
00920                         currToken.addAttribute(parser->docPtr()->document(), buffer, attrName, v);
00921                         dest = buffer;
00922                         tag = SearchAttribute;
00923                     }
00924                     break;
00925                 }
00926                 atespace = true;
00927                 ++src;
00928             }
00929             break;
00930         }
00931         case SearchValue:
00932         {
00933             ushort curchar;
00934             while(src.length()) {
00935                 curchar = src->unicode();
00936                 if(curchar > ' ') {
00937                     if(( curchar == '\'' || curchar == '\"' )) {
00938                         tquote = curchar == '\"' ? DoubleQuote : SingleQuote;
00939                         tag = QuotedValue;
00940                         ++src;
00941                     } else
00942                         tag = Value;
00943 
00944                     break;
00945                 }
00946                 ++src;
00947             }
00948             break;
00949         }
00950         case QuotedValue:
00951         {
00952 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
00953                 qDebug("QuotedValue");
00954 #endif
00955             ushort curchar;
00956             while(src.length()) {
00957                 checkBuffer();
00958 
00959                 curchar = src->unicode();
00960                 if(curchar <= '\'' && !src.escaped()) {
00961                     // ### attributes like '&{blaa....};' are supposed to be treated as jscript.
00962                     if ( curchar == '&' )
00963                     {
00964                         ++src;
00965                         parseEntity(src, dest, true);
00966                         break;
00967                     }
00968                     else if ( (tquote == SingleQuote && curchar == '\'') ||
00969                               (tquote == DoubleQuote && curchar == '\"') )
00970                     {
00971                         // some <input type=hidden> rely on trailing spaces. argh
00972                         while(dest > buffer+1 && (*(dest-1) == '\n' || *(dest-1) == '\r'))
00973                             dest--; // remove trailing newlines
00974                         DOMString v(buffer+1, dest-buffer-1);
00975                         currToken.addAttribute(parser->docPtr()->document(), buffer, attrName, v);
00976 
00977                         dest = buffer;
00978                         tag = SearchAttribute;
00979                         tquote = NoQuote;
00980                         ++src;
00981                         break;
00982                     }
00983                 }
00984                 *dest++ = *src;
00985                 ++src;
00986             }
00987             break;
00988         }
00989         case Value:
00990         {
00991 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
00992             qDebug("Value");
00993 #endif
00994             ushort curchar;
00995             while(src.length()) {
00996                 checkBuffer();
00997                 curchar = src->unicode();
00998                 if(curchar <= '>' && !src.escaped()) {
00999                     // parse Entities
01000                     if ( curchar == '&' )
01001                     {
01002                         ++src;
01003                         parseEntity(src, dest, true);
01004                         break;
01005                     }
01006                     // no quotes. Every space means end of value
01007                     // '/' does not delimit in IE!
01008                     if ( curchar <= ' ' || curchar == '>' )
01009                     {
01010                         DOMString v(buffer+1, dest-buffer-1);
01011                         currToken.addAttribute(parser->docPtr()->document(), buffer, attrName, v);
01012                         dest = buffer;
01013                         tag = SearchAttribute;
01014                         break;
01015                     }
01016                 }
01017 
01018                 *dest++ = *src;
01019                 ++src;
01020             }
01021             break;
01022         }
01023         case SearchEnd:
01024         {
01025 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
01026                 qDebug("SearchEnd");
01027 #endif
01028             while(src.length()) {
01029                 if(*src == '>')
01030                     break;
01031 
01032                 if (*src == '/')
01033                     currToken.flat = true;
01034 
01035                 ++src;
01036             }
01037             if(!src.length() && *src != '>') break;
01038 
01039             searchCount = 0; // Stop looking for '<!--' sequence
01040             tag = NoTag;
01041             tquote = NoQuote;
01042             ++src;
01043 
01044             if ( !currToken.id ) //stop if tag is unknown
01045                 return;
01046 
01047             uint tagID = currToken.id;
01048 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 0
01049             kdDebug( 6036 ) << "appending Tag: " << tagID << endl;
01050 #endif
01051             bool beginTag = !currToken.flat && (tagID < ID_CLOSE_TAG);
01052 
01053             if(tagID >= ID_CLOSE_TAG)
01054                 tagID -= ID_CLOSE_TAG;
01055             else if ( beginTag && !brokenScript && tagID == ID_SCRIPT ) {
01056                 AttributeImpl* a = 0;
01057                 scriptSrc = scriptSrcCharset = QString::null;
01058                 if ( currToken.attrs && /* potentially have a ATTR_SRC ? */
01059                      parser->doc()->view()->part()->jScriptEnabled() && /* jscript allowed at all? */
01060                      view /* are we a regular tokenizer or just for innerHTML ? */
01061                     ) {
01062                     if ( ( a = currToken.attrs->getAttributeItem( ATTR_SRC ) ) )
01063                         scriptSrc = parser->doc()->completeURL(khtml::parseURL( a->value() ).string() );
01064                     if ( ( a = currToken.attrs->getAttributeItem( ATTR_CHARSET ) ) )
01065                         scriptSrcCharset = a->value().string().stripWhiteSpace();
01066                     if ( scriptSrcCharset.isEmpty() )
01067                         scriptSrcCharset = parser->doc()->view()->part()->encoding();
01068                     if (!(a = currToken.attrs->getAttributeItem( ATTR_LANGUAGE )))
01069                         a = currToken.attrs->getAttributeItem(ATTR_TYPE);
01070                 }
01071                 javascript = true;
01072                 if( a ) {
01073                     QString lang = a->value().string();
01074                     lang = lang.lower();
01075                     if( !lang.contains("javascript") &&
01076                         !lang.contains("ecmascript") &&
01077                         !lang.contains("livescript") &&
01078                         !lang.contains("jscript") )
01079                         javascript = false;
01080                 }
01081             }
01082 
01083             processToken();
01084 
01085             // lets see if we're still in parsing mood for spaces
01086             pre = parser->preMode();
01087 
01088             switch( tagID ) {
01089             case ID_PRE:
01090                 prePos = 0;
01091                 break;
01092             case ID_SCRIPT:
01093                 if (beginTag) {
01094                     searchStopper = scriptEnd;
01095                     searchStopperLen = 8;
01096                     script = true;
01097                     parseSpecial(src);
01098                 }
01099                 break;
01100             case ID_STYLE:
01101                 if (beginTag) {
01102                     searchStopper = styleEnd;
01103                     searchStopperLen = 7;
01104                     style = true;
01105                     parseSpecial(src);
01106                 }
01107                 break;
01108             case ID_TEXTAREA:
01109                 if(beginTag) {
01110                     searchStopper = textareaEnd;
01111                     searchStopperLen = 10;
01112                     textarea = true;
01113                     discard = AllDiscard;
01114                     parseSpecial(src);
01115                 }
01116                 break;
01117             case ID_TITLE:
01118                 if (beginTag) {
01119                     searchStopper = titleEnd;
01120                     searchStopperLen = 7;
01121                     title = true;
01122                     parseSpecial(src);
01123                 }
01124                 break;
01125             case ID_XMP:
01126                 if (beginTag) {
01127                     searchStopper = xmpEnd;
01128                     searchStopperLen = 5;
01129                     xmp = true;
01130                     parseSpecial(src);
01131                 }
01132                 break;
01133             case ID_SELECT:
01134                 select = beginTag;
01135                 break;
01136             case ID_PLAINTEXT:
01137                 plaintext = beginTag;
01138                 break;
01139             }
01140             return; // Finished parsing tag!
01141         }
01142         } // end switch
01143     }
01144     return;
01145 }
01146 
// Writes the whitespace remembered in 'pending' to 'dest' and clears it.
//
// A single space is emitted when the 'select' flag is set (and we are not in
// a comment or script) and whenever neither textarea nor pre mode is active.
// In textarea mode the pending linefeed, space or tab is written literally;
// in pre mode a tab is expanded with spaces up to the next TAB_SIZE stop.
// Both of these modes keep 'prePos' up to date and assert when called with
// NonePending.
01147 void HTMLTokenizer::addPending()
01148 {
01149     const bool inSelect = select && !comment && !script;
01150     if ( inSelect || ( !textarea && !pre ) )
01151     {
01152         *dest++ = ' ';
01153     }
01154     else if ( textarea )
01155     {
01156         if ( pending == LFPending ) {
01157             *dest++ = '\n';
01158             prePos = 0;
01159         }
01160         else if ( pending == SpacePending ) {
01161             *dest++ = ' ';
01162             ++prePos;
01163         }
01164         else if ( pending == TabPending ) {
01165             *dest++ = '\t';
01166             prePos += TAB_SIZE - (prePos % TAB_SIZE);
01167         }
01168         else if ( pending == NonePending ) {
01169             assert(0);
01170         }
01171     }
01172     else
01173     {
              // only reachable in pre mode
01174         switch (pending)
01175         {
01176         case LFPending:
01177             *dest++ = '\n';
01178             prePos = 0;
01179             break;
01180         case SpacePending:
                  // Insert a breaking space
01181             *dest++ = QChar(' ');
01182             ++prePos;
01183             break;
01184         case TabPending:
01185         {
                  // pad with spaces up to the next tab stop
01186             const int fill = TAB_SIZE - ( prePos % TAB_SIZE );
01187             for ( int left = fill; left > 0; --left )
01188                 *dest++ = QChar(' ');
01189             prePos += fill;
01190             break;
01191         }
01192         case NonePending:
01193             assert(0);
01194             break;
01195         }
01196     }
01197 
01198     pending = NonePending;
01199 }
01200 
// Feeds a chunk of markup into the tokenizer.
//
//   str        - the text to tokenize.
//   appendData - when true while a script is executing, the text is queued
//                in pendingSrc instead of being parsed right away.
//
// Nothing happens once the output buffer is gone (buffer == 0). Text is also
// queued, unparsed, while no script is executing but the cachedScript queue
// is non-empty. While the tokenizer is on hold, str is appended to the
// unconsumed remainder of the current source and the call returns without
// parsing. Otherwise str becomes the current source and is consumed in the
// loop below, which dispatches on the tokenizer's state flags.
// If noMoreData is set and no script is queued or executing, end() is called
// once the source has been consumed.
01201 void HTMLTokenizer::write( const QString &str, bool appendData )
01202 {
01203 #ifdef TOKEN_DEBUG
01204     kdDebug( 6036 ) << this << " Tokenizer::write(\"" << str << "\"," << appendData << ")" << endl;
01205 #endif
01206 
01207     if ( !buffer )
01208         return;
01209 
01210     if ( ( m_executingScript && appendData ) ||
01211          ( !m_executingScript && cachedScript.count() ) ) {
01212         // don't parse; we will do this later
01213         pendingSrc += str;
01214         return;
01215     }
01216 
01217     if ( onHold ) {
              // on hold: keep what is left of the current source, append the
              // new text and return without tokenizing anything
01218         QString rest = QString( src.current(), src.length() );
01219         rest += str;
01220         setSrc(rest);
01221         return;
01222     }
01223     else
01224         setSrc(str);
01225 
01226 //     if (Entity)
01227 //         parseEntity(src, dest);
01228 
01229     while ( src.length() )
01230     {
01231         // do we need to enlarge the buffer?
01232         checkBuffer();
01233 
01234         ushort cc = src->unicode();
01235 
              // skipLF is set after a '\r' (see below); it only stays armed
              // if the very next character is '\n'
01236         if (skipLF && (cc != '\n'))
01237             skipLF = false;
01238 
01239         if (skipLF) {
                  // swallow the '\n' of a CRLF pair
01240             skipLF = false;
01241             ++src;
01242         }
              // an active special state takes precedence over plain text
              // handling; each parse* helper consumes from src itself
01243         else if ( Entity )
01244             parseEntity( src, dest );
01245         else if ( plaintext )
01246             parseText( src );
01247         else if (script)
01248             parseSpecial(src);
01249         else if (style)
01250             parseSpecial(src);
01251         else if (xmp)
01252             parseSpecial(src);
01253         else if (textarea)
01254             parseSpecial(src);
01255         else if (title)
01256             parseSpecial(src);
01257         else if (comment)
01258             parseComment(src);
01259         else if (server)
01260             parseServer(src);
01261         else if (processingInstruction)
01262             parseProcessingInstruction(src);
01263         else if (tag)
01264             parseTag(src);
01265         else if ( startTag )
01266         {
                  // the previous character was an unescaped '<'; cc is the
                  // first character after it
01267             startTag = false;
01268 
01269             switch(cc) {
01270             case '/':
01271                 break;
01272             case '!':
01273             {
01274                 // <!-- comment -->
01275                 searchCount = 1; // Look for '<!--' sequence to start comment
01276 
01277                 break;
01278             }
01279             case '?':
01280             {
01281                 // xml processing instruction
01282                 processingInstruction = true;
01283                 tquote = NoQuote;
01284                 parseProcessingInstruction(src);
01285                 continue;
01286 
01287                 break;
01288             }
01289             case '%':
01290                 if (!brokenServer) {
01291                     // <% server stuff, handle as comment %>
01292                     server = true;
01293                     tquote = NoQuote;
01294                     parseServer(src);
01295                     continue;
01296                 }
01297                 // else fall through
01298             default:
01299             {
01300                 if( ((cc >= 'a') && (cc <= 'z')) || ((cc >= 'A') && (cc <= 'Z')))
01301                 {
01302                     // Start of a Start-Tag
01303                 }
01304                 else
01305                 {
01306                     // Invalid tag
01307                     // Add as is
01308                     if (pending)
01309                         addPending();
01310                     *dest = '<';
01311                     dest++;
01312                     continue;
01313                 }
01314             }
01315             }; // end case
01316 
                  // a tag really starts here: settle pending whitespace and
                  // flush the text collected so far before parsing the tag
01317             if ( pending ) {
01318                 // pre context always gets its spaces/linefeeds
01319                 if ( pre )
01320                     addPending();
01321                 // only add in existing inline context or if
01322                 // we just started one, i.e. we're about to insert real text
01323                 else if ( !parser->selectMode() &&
01324                           ( !parser->noSpaces() || dest > buffer )) {
01325                     addPending();
01326                     discard = AllDiscard;
01327                 }
01328                 // just forget it
01329                 else
01330                     pending = NonePending;
01331             }
01332 
01333             processToken();
01334 
01335             cBufferPos = 0;
01336             tag = TagName;
01337             parseTag(src);
01338         }
01339         else if ( cc == '&' && !src.escaped())
01340         {
01341             ++src;
01342             if ( pending )
01343                 addPending();
01344             parseEntity(src, dest, true);
01345         }
01346         else if ( cc == '<' && !src.escaped())
01347         {
                  // remember the line the tag starts on; processToken()
                  // passes it to the JS proxy
01348             tagStartLineno = lineno+src.lineCount();
01349             ++src;
01350             startTag = true;
01351         }
01352         else if (( cc == '\n' ) || ( cc == '\r' ))
01353         {
01354             if ( pre || textarea)
01355             {
01356                 if (discard == LFDiscard || discard == AllDiscard)
01357                 {
01358                     // Ignore this LF
01359                     discard = NoneDiscard; // We have discarded 1 LF
01360                 }
01361                 else
01362                 {
01363                     // Process this LF
01364                     if (pending)
01365                         addPending();
01366                     pending = LFPending;
01367                 }
01368             }
01369             else
01370             {
01371                 if (discard == LFDiscard)
01372                 {
01373                     // Ignore this LF
01374                     discard = NoneDiscard; // We have discarded 1 LF
01375                 }
01376                 else if(discard == AllDiscard)
01377                 {
                          // keep discarding; the discard state is left unchanged
01378                 }
01379                 else
01380                 {
01381                     // Process this LF
01382                     if (pending == NonePending)
01383                         pending = LFPending;
01384                 }
01385             }
01386             /* Check for MS-DOS CRLF sequence */
01387             if (cc == '\r')
01388             {
01389                 skipLF = true;
01390             }
01391             ++src;
01392         }
01393         else if (( cc == ' ' ) || ( cc == '\t' ))
01394         {
01395             if ( pre || textarea)
01396             {
01397                 if (discard == SpaceDiscard || discard == AllDiscard)
01398                 {
01399                     // Ignore this space
01400                     discard = NoneDiscard; // We have discarded 1 space
01401                 }
01402                 else {
01403                     if (pending)
01404                         addPending();
01405                     if (cc == ' ')
01406                         pending = SpacePending;
01407                     else
01408                         pending = TabPending;
01409                 }
01410             }
01411             else
01412             {
01413                 if(discard == SpaceDiscard)
01414                     discard = NoneDiscard;
01415                 else if(discard == AllDiscard)
01416                 { }
01417                 else
01418                     pending = SpacePending;
01419             }
01420             ++src;
01421         }
01422         else
01423         {
                  // ordinary character: emit pending whitespace first, then
                  // copy the character to the output buffer
01424             if (pending)
01425                 addPending();
01426 
01427             discard = NoneDiscard;
01428             if ( pre )
01429             {
01430                 prePos++;
01431             }
01432             *dest = *src;
01433             fixUpChar( *dest );
01434             ++dest;
01435             ++src;
01436         }
01437     }
01438     _src = QString::null;
01439 
01440     if (noMoreData && cachedScript.isEmpty() && !m_executingScript )
01441         end(); // this actually causes us to be deleted
01442 }
01443 
// Ends tokenizing: flushes the last token (unless a tag is being parsed),
// releases the token buffer and the script buffer, and emits
// finishedParsing(). If the buffer has already been released, only the
// signal is emitted.
01444 void HTMLTokenizer::end()
01445 {
01446     if ( buffer ) {
01447         // parseTag is using the buffer for different matters
01448         if ( !tag )
01449             processToken();
01450 
01451         if ( buffer )
01452             KHTML_DELETE_QCHAR_VEC(buffer);
01453         buffer = 0;
01454 
01455         if ( scriptCode )
01456             KHTML_DELETE_QCHAR_VEC(scriptCode);
01457         scriptCode = 0;
01458         scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
01459     }
01460 
01461     emit finishedParsing();
01462 }
01466 
// Tells the tokenizer that no more data will arrive.
//
// If the input stopped inside a title, script, comment or server ("<% %>")
// section and text has been collected in scriptCode, that section never saw
// its terminator. The section is then flagged as broken (brokenComments,
// brokenServer or brokenScript; title has no such flag), its state flags are
// cleared and the collected text is fed back through write() so it gets
// tokenized as ordinary markup. An unterminated script is still passed to
// scriptHandler() first.
//
// Afterwards noMoreData is set and, unless a cached script is outstanding,
// a script is executing or the tokenizer is on hold, end() is called.
01467 void HTMLTokenizer::finish()
01468 {
01469     // do this as long as we don't find matching comment ends
01470     while((title || script || comment || server) && scriptCode && scriptCodeSize)
01471     {
01472         // we've found an unmatched comment start
01473         if (comment)
01474             brokenComments = true;
01475         else if (server)
01476             brokenServer = true;
01477         else if (script)
                  // an unterminated script must set the script flag (the one
                  // parseTag() tests before special-casing <script>), not the
                  // server flag
01478             brokenScript = true;
01479         checkScriptBuffer();
01480         scriptCode[ scriptCodeSize ] = 0;
01481         scriptCode[ scriptCodeSize + 1 ] = 0;
01482         int pos;
01483         QString food;
              // 'food' is the text that will be re-fed to write()
01484         if (title || script || style) {
01485             food.setUnicode(scriptCode, scriptCodeSize);
01486         }
01487         else if (server) {
                  // put back a '<' in front of the collected server text
01488             food = "<";
01489             food += QString(scriptCode, scriptCodeSize);
01490         }
01491         else {
                  // comment: keep only what follows the first '>'
                  // (everything when there is none, since find() gives -1)
01492             pos = QConstString(scriptCode, scriptCodeSize).string().find('>');
01493             food.setUnicode(scriptCode+pos+1, scriptCodeSize-pos-1); // deep copy
01494         }
01495         KHTML_DELETE_QCHAR_VEC(scriptCode);
01496         scriptCode = 0;
01497         scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
01498         if ( script )
01499             scriptHandler();
01500         comment = script = title = server = false;
01501         if ( !food.isEmpty() )
01502             write(food, true);
01503     }
01504     // this indicates we will not receive any more data... but if we are waiting on
01505     // an external script to load, we can't finish parsing until that is done
01506     noMoreData = true;
01507     if (cachedScript.isEmpty() && !m_executingScript && !onHold)
01508         end(); // this actually causes us to be deleted
01509 }
01510 
// Hands the current token over to the parser.
//
// If text has accumulated in the buffer (dest > buffer) it is wrapped in a
// DOMStringImpl and sent as an ID_TEXT token. Otherwise the token prepared
// by the tag parser (currToken.id != 0) is sent as it is. With neither text
// nor an id there is nothing to send: the token is reset and the function
// returns early.
// When a view with a JS proxy exists, the proxy is given tagStartLineno as
// event handler line number for the duration of the call.
01511 void HTMLTokenizer::processToken()
01512 {
01513     KJSProxy *jsProxy = view ? view->part()->jScript() : 0L;
01514     if (jsProxy)
01515         jsProxy->setEventHandlerLineno(tagStartLineno);
01516     if ( dest > buffer )
01517     {
01518 #ifdef TOKEN_DEBUG
01519         if(currToken.id) {
01520             qDebug( "unexpected token id: %d, str: *%s*", currToken.id,QConstString( buffer,dest-buffer ).string().latin1() );
01521             assert(0);
01522         }
01523 
01524 #endif
              // turn the buffered characters into a text token
01525         currToken.text = new DOMStringImpl( buffer, dest - buffer );
01526         currToken.text->ref();
01527         currToken.id = ID_TEXT;
01528     }
01529     else if(!currToken.id) {
              // neither text nor a tag: nothing to pass on
01530         currToken.reset();
01531         if (jsProxy)
01532             jsProxy->setEventHandlerLineno(lineno+src.lineCount());
01533         return;
01534     }
01535 
          // the buffer content has been consumed (or was empty)
01536     dest = buffer;
01537 
01538 #ifdef TOKEN_DEBUG
01539     QString name = getTagName(currToken.id).string();
01540     QString text;
01541     if(currToken.text)
01542         text = QConstString(currToken.text->s, currToken.text->l).string();
01543 
01544     kdDebug( 6036 ) << "Token --> " << name << "   id = " << currToken.id << endl;
01545     if (currToken.flat)
01546         kdDebug( 6036 ) << "Token is FLAT!" << endl;
01547     if(!text.isNull())
01548         kdDebug( 6036 ) << "text: \"" << text << "\"" << endl;
01549     unsigned long l = currToken.attrs ? currToken.attrs->length() : 0;
01550     if(l) {
01551         kdDebug( 6036 ) << "Attributes: " << l << endl;
01552         for (unsigned long i = 0; i < l; ++i) {
01553             AttributeImpl* c = currToken.attrs->attributeItem(i);
01554             kdDebug( 6036 ) << "    " << c->id() << " " << parser->doc()->getDocument()->attrName(c->id()).string()
01555                             << "=\"" << c->value().string() << "\"" << endl;
01556         }
01557     }
01558     kdDebug( 6036 ) << endl;
01559 #endif
01560     // pass the token over to the parser, the parser DOES NOT delete the token
01561     parser->parseToken(&currToken);
01562 
          // update the whitespace discard mode for what follows this token
01563     if ( currToken.flat && currToken.id != ID_TEXT && !parser->noSpaces() )
01564         discard = NoneDiscard;
01565     else if ( parser->selectMode() )
01566         discard = AllDiscard;
01567 
01568     currToken.reset();
01569     if (jsProxy)
01570         jsProxy->setEventHandlerLineno(0);
01571 }
01572 
01573 
// Destructor: calls reset() and then deletes the parser object.
01574 HTMLTokenizer::~HTMLTokenizer()
01575 {
01576     reset();
01577     delete parser;
01578 }
01579 
01580 
// Grows the token buffer by at least 'len' QChars (and to at least twice its
// current size). 'dest' keeps its offset within the reallocated storage.
01581 void HTMLTokenizer::enlargeBuffer(int len)
01582 {
01583     const int used = dest - buffer;
01584     const int grown = kMax(size + len, size * 2);
01585 
01586     buffer = static_cast<QChar*>(realloc(buffer, grown * sizeof(QChar)));
01587     dest = buffer + used;
01588     size = grown;
01589 }
01590 
// Grows the script buffer by at least 'len' QChars (and to at least twice
// its current capacity), updating scriptCodeMaxSize accordingly.
01591 void HTMLTokenizer::enlargeScriptBuffer(int len)
01592 {
01593     const int grown = kMax(scriptCodeMaxSize + len, scriptCodeMaxSize * 2);
01594     scriptCode = static_cast<QChar*>(realloc(scriptCode, grown * sizeof(QChar)));
01595     scriptCodeMaxSize = grown;
01596 }
01597 
// Called when a cached (external) script has finished loading.
//
// Executes the scripts at the head of the cachedScript queue for as long as
// the head is loaded, in queue order. After each one, unless we are inside
// synchronous script parsing ('script' set), the text queued in pendingSrc
// is fed back through write(). The queue must not be empty on entry.
01598 void HTMLTokenizer::notifyFinished(CachedObject* /*finishedObj*/)
01599 {
01600     assert(!cachedScript.isEmpty());
01601     bool done = false;
          // 'done' is tested first, so the queue head is not touched once the
          // last script has been dequeued
01602     while (!done && cachedScript.head()->isLoaded()) {
01603 #ifdef TOKEN_DEBUG
01604         kdDebug( 6036 ) << "Finished loading an external script" << endl;
01605 #endif
01606         CachedScript* cs = cachedScript.dequeue();
01607         done = cachedScript.isEmpty();
01608         DOMString scriptSource = cs->script();
01609 #ifdef TOKEN_DEBUG
01610         kdDebug( 6036 ) << "External script is:" << endl << scriptSource.string() << endl;
01611 #endif
01612         setSrc(QString::null);
01613 
01614         // make sure we forget about the script before we execute the new one
01615         // infinite recursion might happen otherwise
              // (the URL is copied first because cs is released right after)
01616         QString cachedScriptUrl( cs->url().string() );
01617         cs->deref(this);
01618 
01619         scriptExecution( scriptSource.string(), cachedScriptUrl );
01620 
01621         // 'script' is true when we are called synchronously from
01622         // parseScript(). In that case parseScript() will take care
01623         // of 'scriptOutput'.
01624         if ( !script ) {
                  // take the queued text out of pendingSrc before writing it,
                  // so the member is already empty when write() runs
01625             QString rest = pendingSrc;
01626             pendingSrc = QString::null;
01627             write(rest, false);
01628             // we might be deleted at this point, do not
01629             // access any members.
01630         }
01631     }
01632 }
01633 
// Replaces the current source text with 'source'.
//
// The statement order matters: the line count of the old source iterator is
// added to lineno before the iterator is replaced, and _src is assigned
// before src is re-created on top of it.
01634 void HTMLTokenizer::setSrc(const QString& source)
01635 {
01636     lineno += src.lineCount();
01637     _src = source;
01638     src = DOMStringIt(_src);
01639 }
01640 
// Switches the on-hold state. When the tokenizer newly goes on hold, the
// unconsumed rest of the current source is re-installed through setSrc() as
// a string of its own. Calls that do not change the state do nothing.
01641 void HTMLTokenizer::setOnHold(bool _onHold)
01642 {
01643     const bool changed = (onHold != _onHold);
01644     onHold = _onHold;
01645     if (changed && onHold)
01646         setSrc(QString(src.current(), src.length())); // ### deep copy
01647 }
01648 
KDE Logo
This file is part of the documentation for kdelibs Version 3.1.5.
Documentation copyright © 1996-2002 the KDE developers.
Generated on Wed Jan 28 13:34:00 2004 by doxygen 1.3.4 written by Dimitri van Heesch, © 1997-2001