View Javadoc

1   package org.codehaus.groovy.syntax.lexer;
2   
3   import org.codehaus.groovy.syntax.ReadException;
4   import org.codehaus.groovy.syntax.Numbers;
5   import org.codehaus.groovy.syntax.Types;
6   import org.codehaus.groovy.syntax.Token;
7   
8   /***
9    *  The core code used in lexing Groovy.
10   *
11   *  @author Bob Mcwhirter
12   *  @author James Strachan
13   *  @author John Wilson
14   *  @author Chris Poirier
15   */
16  
17  public class GroovyLexerBase extends LexerBase
18  {
19  
20      protected StringLexer  stringLexer  = new StringLexer();   // support lexer for processing strings
21      protected GStringLexer gstringLexer = new GStringLexer();  // support lexer for processing GStrings
22  
23  
24     /***
25      *  Finds and returns (and consumes) the next token from the underlying stream.
26      *  Returns null when out of tokens.
27      */
28  
29      public Token nextToken() throws ReadException, LexerException
30      {
31          // System.out.println( "entering GroovyLexerBase.nextToken() on " + this );
32  
33          Token token = null;
34          OUTER_LOOP : while (token == null)
35          {
36  
37              //
38              // Get from the delegate, if available
39  
40              if( delegate != null )
41              {
42                  token = delegate.nextToken();
43  
44                  if( token == null )
45                  {
46                      undelegate();
47                  }
48                  else
49                  {
50                      break OUTER_LOOP;
51                  }
52              }
53  
54  
55              //
56              // Otherwise, do it the hard way.
57  
58              char c = la();
59  
60              ROOT_SWITCH : switch (c)
61              {
62                  case (CharStream.EOS) :
63                  {
64                      break OUTER_LOOP;
65                  }
66                  case (' ') :
67                  case ('\t') :
68                  {
69                      consume();
70                      token = null;
71                      break ROOT_SWITCH;
72                  }
73                  case ('\r') :
74                  case ('\n') :
75                  {
76                      mark();
77                      token = tokenizeEOL();
78                      break ROOT_SWITCH;
79                  }
80                  case ('{') :
81                  {
82                      mark();
83                      consume();
84                      token = symbol( Types.LEFT_CURLY_BRACE );
85                      break ROOT_SWITCH;
86                  }
87                  case ('}') :
88                  {
89                      mark();
90                      consume();
91                      token = symbol( Types.RIGHT_CURLY_BRACE );
92                      break ROOT_SWITCH;
93                  }
94                  case ('[') :
95                  {
96                      mark();
97                      consume();
98                      token = symbol( Types.LEFT_SQUARE_BRACKET );
99                      break ROOT_SWITCH;
100                 }
101                 case (']') :
102                 {
103                     mark();
104                     consume();
105                     token = symbol( Types.RIGHT_SQUARE_BRACKET );
106                     break ROOT_SWITCH;
107                 }
108                 case ('(') :
109                 {
110                     mark();
111                     consume();
112                     token = symbol( Types.LEFT_PARENTHESIS );
113                     break ROOT_SWITCH;
114                 }
115                 case (')') :
116                 {
117                     mark();
118                     consume();
119                     token = symbol( Types.RIGHT_PARENTHESIS );
120                     break ROOT_SWITCH;
121                 }
122                 case ('#') :
123                 {
124                     consume();
125 
126                     token = symbol( Types.NEWLINE, -1 );
127 
128                     CONSUME_LOOP : while( true )
129                     {
130                         switch (c = la())
131                         {
132                             case ('\r') :
133                             case ('\n') :
134                             {
135                                 readEOL();
136                                 break CONSUME_LOOP;
137                             }
138                             case CharStream.EOS :
139                             {
140                                 break CONSUME_LOOP;
141                             }
142                             default :
143                             {
144                                 consume();
145                             }
146                         }
147                     }
148                     break ROOT_SWITCH;
149                 }
150                 case ('/') :
151                 {
152                     mark();
153                     consume();
154 
155                     c = la();
156 
157                     MULTICHAR_SWITCH : switch (c)
158                     {
159                         case ('=') :
160                         {
161                             consume();
162                             token = symbol( Types.DIVIDE_EQUAL );
163                             break MULTICHAR_SWITCH;
164                         }
165                         case ('/') :
166                         {
167                             consume();
168                             token = symbol( Types.NEWLINE, -2 );
169 
170                             CONSUME_LOOP : while (true)
171                             {
172                                 switch (c = la())
173                                 {
174                                     case ('\r') :
175                                     case ('\n') :
176                                     {
177                                         readEOL();
178                                         break CONSUME_LOOP;
179                                     }
180                                     case CharStream.EOS :
181                                     {
182                                         break CONSUME_LOOP;
183                                     }
184                                     default :
185                                     {
186                                         consume();
187                                     }
188                                 }
189                             }
190                             break MULTICHAR_SWITCH;
191                         }
192                         case ('*') :
193                         {
194                             CONSUME_LOOP : while (true)
195                             {
196                                 CONSUME_SWITCH : switch (c = la())
197                                 {
198                                     case ('*') :
199                                     {
200                                         consume();
201                                         if (la() == '/')
202                                         {
203                                             consume();
204                                             break CONSUME_LOOP;
205                                         }
206                                         break CONSUME_SWITCH;
207                                     }
208                                     case ('\r') :
209                                     case ('\n') :
210                                     {
211                                         readEOL();
212                                         break CONSUME_SWITCH;
213                                     }
214                                     case CharStream.EOS :
215                                     {
216                                         break CONSUME_LOOP;
217                                     }
218                                     default :
219                                     {
220                                         consume();
221                                     }
222                                 }
223                             }
224                             token = null;
225                             break MULTICHAR_SWITCH;
226                         }
227                         default :
228                         {
229                             token = symbol( Types.DIVIDE );
230                             break MULTICHAR_SWITCH;
231                         }
232                     }
233                     break ROOT_SWITCH;
234                 }
235                 case ('%') :
236                 {
237                     mark();
238                     consume();
239 
240                     c = la();
241 
242                     MULTICHAR_SWITCH : switch (c)
243                     {
244                         case ('=') :
245                         {
246                             consume();
247                             token = symbol( Types.MOD_EQUAL );
248                             break MULTICHAR_SWITCH;
249                         }
250                         default :
251                         {
252                             token = symbol( Types.MOD );
253                             break MULTICHAR_SWITCH;
254                         }
255                     }
256                     break ROOT_SWITCH;
257                 }
258                 case ('//') :
259                 {
260                     mark();
261                     consume();
262 
263                     c = la();
264 
265                     MULTICHAR_SWITCH : switch (c)
266                     {
267                         case ('=') :
268                         {
269                             consume();
270                             token = symbol( Types.INTDIV_EQUAL );
271                             break MULTICHAR_SWITCH;
272                         }
273                         default :
274                         {
275                             token = symbol( Types.INTDIV );
276                             break MULTICHAR_SWITCH;
277                         }
278                     }
279                     break ROOT_SWITCH;
280                 }
281                 case ('~') :
282                 {
283                     mark();
284                     consume();
285 
286                     token = symbol( Types.REGEX_PATTERN );
287                     break ROOT_SWITCH;
288                 }
289                 case ('!') :
290                 {
291                     mark();
292                     consume();
293 
294                     c = la();
295 
296                     MULTICHAR_SWITCH : switch (c)
297                     {
298                         case ('=') :
299                         {
300                             consume();
301                             if( la() == '=' )
302                             {
303                                 consume();
304                                 token = symbol( Types.COMPARE_NOT_IDENTICAL );
305                             }
306                             else
307                             {
308                                 token = symbol( Types.COMPARE_NOT_EQUAL );
309                             }
310                             break MULTICHAR_SWITCH;
311                         }
312                         default :
313                         {
314                             token = symbol( Types.NOT );
315                             break MULTICHAR_SWITCH;
316                         }
317                     }
318                     break ROOT_SWITCH;
319                 }
320                 case ('=') :
321                 {
322                     mark();
323                     consume();
324 
325                     c = la();
326 
327                     MULTICHAR_SWITCH : switch (c)
328                     {
329                         case ('=') :
330                         {
331                             consume();
332                             c = la();
333 
334                             switch (c)
335                             {
336                                 case '=' :
337                                 {
338                                     consume();
339                                     token = symbol( Types.COMPARE_IDENTICAL );
340                                     break;
341                                 }
342                                 case '~' :
343                                 {
344                                     consume();
345                                     token = symbol( Types.MATCH_REGEX );
346                                     break;
347                                 }
348                                 default :
349                                 {
350                                     token = symbol( Types.COMPARE_EQUAL );
351                                 }
352                             }
353                             break MULTICHAR_SWITCH;
354                         }
355                         case '~' :
356                         {
357                             consume();
358                             token = symbol( Types.FIND_REGEX );
359                             break MULTICHAR_SWITCH;
360                         }
361                         default :
362                         {
363                             token = symbol( Types.EQUAL );
364                             break MULTICHAR_SWITCH;
365                         }
366                     }
367                     break ROOT_SWITCH;
368                 }
369                 case ('&') :
370                 {
371                     mark();
372                     consume();
373 
374                     c = la();
375 
376                     MULTICHAR_SWITCH : switch (c)
377                     {
378                         case ('&') :
379                         {
380                             consume();
381 
382                             if( la() == '=' )
383                             {
384                                 consume();
385                                 token = symbol( Types.LOGICAL_AND_EQUAL );
386                             }
387                             else
388                             {
389                                 token = symbol( Types.LOGICAL_AND );
390                             }
391 
392                             break MULTICHAR_SWITCH;
393                         }
394                         default :
395                         {
396                             unexpected( c, new char[] { '&' }, 1 );
397                         }
398                     }
399                     break ROOT_SWITCH;
400                 }
401                 case ('|') :
402                 {
403                     mark();
404                     consume();
405                     c = la();
406 
407                     MULTICHAR_SWITCH : switch (c)
408                     {
409                         case ('|') :
410                         {
411                             consume();
412 
413                             if( la() == '=' )
414                             {
415                                 consume();
416                                 token = symbol( Types.LOGICAL_OR_EQUAL );
417                             }
418                             else
419                             {
420                                 token = symbol( Types.LOGICAL_OR );
421                             }
422 
423                             break MULTICHAR_SWITCH;
424                         }
425                         default :
426                         {
427                             token = symbol( Types.PIPE );
428                             break MULTICHAR_SWITCH;
429                         }
430                     }
431                     break ROOT_SWITCH;
432                 }
433                 case ('+') :
434                 {
435                     mark();
436                     consume();
437 
438                     c = la();
439 
440                     MULTICHAR_SWITCH : switch (c)
441                     {
442                         case ('+') :
443                         {
444                             consume();
445                             token = symbol( Types.PLUS_PLUS );
446                             break MULTICHAR_SWITCH;
447                         }
448                         case ('=') :
449                         {
450                             consume();
451                             token = symbol( Types.PLUS_EQUAL );
452                             break MULTICHAR_SWITCH;
453                         }
454                         default :
455                         {
456                             token = symbol( Types.PLUS );
457                             break MULTICHAR_SWITCH;
458                         }
459                     }
460                     break ROOT_SWITCH;
461                 }
462                 case ('-') :
463                 {
464                     mark();
465                     consume();
466 
467                     c = la();
468 
469                     MULTICHAR_SWITCH : switch (c)
470                     {
471                         case ('-') :
472                         {
473                             consume();
474                             token = symbol( Types.MINUS_MINUS );
475                             break MULTICHAR_SWITCH;
476                         }
477                         case ('=') :
478                         {
479                             consume();
480                             token = symbol( Types.MINUS_EQUAL );
481                             break MULTICHAR_SWITCH;
482                         }
483                         case ('>') :
484                         {
485                             consume();
486                             token = symbol( Types.NAVIGATE );
487                             break MULTICHAR_SWITCH;
488                         }
489                         default :
490                         {
491                             token = symbol( Types.MINUS );
492                             break MULTICHAR_SWITCH;
493                         }
494                     }
495                     break ROOT_SWITCH;
496                 }
497                 case ('*') :
498                 {
499                     mark();
500                     consume();
501 
502                     c = la();
503 
504                     MULTICHAR_SWITCH : switch (c)
505                     {
506                         case ('=') :
507                         {
508                             consume();
509                             token = symbol( Types.MULTIPLY_EQUAL );
510                             break MULTICHAR_SWITCH;
511                         }
512                         default :
513                         {
514                             token = symbol( Types.MULTIPLY );
515                             break MULTICHAR_SWITCH;
516                         }
517                     }
518                     break ROOT_SWITCH;
519                 }
520                 case (':') :
521                 {
522                     mark();
523                     consume();
524 
525                     token = symbol( Types.COLON );
526                     break ROOT_SWITCH;
527                 }
528                 case (',') :
529                 {
530                     mark();
531                     consume();
532                     token = symbol( Types.COMMA );
533                     break ROOT_SWITCH;
534                 }
535                 case (';') :
536                 {
537                     mark();
538                     consume();
539                     token = symbol( Types.SEMICOLON );
540                     break ROOT_SWITCH;
541                 }
542                 case ('?') :
543                 {
544                     mark();
545                     consume();
546                     token = symbol( Types.QUESTION );
547                     break ROOT_SWITCH;
548                 }
549                 case ('<') :
550                 {
551                     mark();
552                     consume();
553 
554                     c = la();
555 
556                     MULTICHAR_SWITCH : switch (c)
557                     {
558                         case ('=') :
559                         {
560                             consume();
561                             c = la();
562                             if (c == '>')
563                             {
564                                 consume();
565                                 token = symbol( Types.COMPARE_TO );
566                             }
567                             else
568                             {
569                                 token = symbol( Types.COMPARE_LESS_THAN_EQUAL );
570                             }
571                             break MULTICHAR_SWITCH;
572                         }
573                         case ('<') :
574                         {
575                             consume();
576                             c = la();
577 
578                             //
579                             // It's a "here-doc", created using <<<TOK ... \nTOK.   The terminator
580                             // runs from the <<< to the end of the line.  The marker is then used
581                             // to create a HereDocLexer which becomes our delegate until the heredoc
582                             // is finished.
583 
584                             if (c == '<')
585                             {
586                                 consume();
587 
588                                 StringBuffer marker = new StringBuffer();
589                                 while( (c = la()) != '\n' && c != '\r' && c != CharStream.EOS )
590                                 {
591                                     marker.append( consume() );
592                                 }
593 
594                                 readEOL();
595 
596                                 Lexer child = new HereDocLexer( marker.toString() );
597                                 delegate( child );
598 
599                                 gstringLexer.reset();
600                                 child.delegate( gstringLexer );
601 
602                                 break ROOT_SWITCH;
603                             }
604                             else
605                             {
606                                 token = symbol( Types.LEFT_SHIFT );
607                                 break ROOT_SWITCH;
608                             }
609                         }
610                         default :
611                         {
612                             token = symbol( Types.COMPARE_LESS_THAN );
613                             break MULTICHAR_SWITCH;
614                         }
615                     }
616                     break ROOT_SWITCH;
617                 }
618                 case ('>') :
619                 {
620                     mark();
621                     consume();
622 
623                     c = la();
624 
625                     MULTICHAR_SWITCH : switch (c)
626                     {
627                         case ('=') :
628                         {
629                             consume();
630                             token = symbol( Types.COMPARE_GREATER_THAN_EQUAL );
631                             break MULTICHAR_SWITCH;
632                         }
633                         case ('>') :
634                         {
635                             consume();
636                             if( la() == '>' )
637                             {
638                                 consume();
639                                 token = symbol( Types.RIGHT_SHIFT_UNSIGNED );
640                             } 
641                             else
642                             {	
643                             	token = symbol( Types.RIGHT_SHIFT );
644                             }
645                             break MULTICHAR_SWITCH;
646                         }
647                         default :
648                         {
649                             token = symbol( Types.COMPARE_GREATER_THAN );
650                             break MULTICHAR_SWITCH;
651                         }
652                     }
653                     break ROOT_SWITCH;
654                 }
655                 case ('\'') :
656                 {
657                     mark();
658 
659                     stringLexer.reset();
660                     stringLexer.allowGStrings(false);
661                     delegate( stringLexer );
662 
663                     break ROOT_SWITCH;
664                 }
665                 case ('"') :
666                 {
667                     mark();
668 
669                     stringLexer.reset();
670                     stringLexer.allowGStrings(true);
671                     delegate( stringLexer );
672 
673                     gstringLexer.reset();
674                     stringLexer.delegate( gstringLexer );
675 
676                     break ROOT_SWITCH;
677                 }
678                 case ('0') :
679                 case ('1') :
680                 case ('2') :
681                 case ('3') :
682                 case ('4') :
683                 case ('5') :
684                 case ('6') :
685                 case ('7') :
686                 case ('8') :
687                 case ('9') :
688                 case ('.') :
689                 {
690                     mark();
691 
692                     //
693                     // If it is a '.' and not followed by a digit,
694                     // it's an operator.
695 
696                     if( c == '.' && !Numbers.isDigit(la(2)) )
697                     {
698                         consume();
699                         if( la() == '.' )
700                         {
701                             consume();
702                             if( la() == '.' )
703                             {
704                                 consume();
705                                 token = symbol( Types.DOT_DOT_DOT );
706                             }
707                             else
708                             {
709                                 token = symbol( Types.DOT_DOT );
710                             }
711                         }
712                         else
713                         {
714                             token = symbol( Types.DOT );
715                         }
716                         break ROOT_SWITCH;
717                     }
718 
719 
720                     //
721                     // Otherwise, we are processing a number (integer or decimal).
722 
723                     StringBuffer numericLiteral = new StringBuffer();
724                     boolean      isDecimal      = false;
725 
726 
727                     //
728                     // If it starts 0 and isn't a decimal number, we give
729                     // special handling for hexadecimal or octal notation.
730 
731                     char c2 = la(2);
732                     if( c == '0' && (c2 == 'X' || c2 == 'x' || Numbers.isDigit(c2)) )
733                     {
734                         numericLiteral.append( consume() );
735 
736                         if( (c = la()) == 'X' || c == 'x' )
737                         {
738                             numericLiteral.append( consume() );
739                             if( Numbers.isHexDigit(la()) )
740                             {
741                                 while( Numbers.isHexDigit(la()) )
742                                 {
743                                     numericLiteral.append( consume() );
744                                 }
745                             }
746                             else
747                             {
748                                 unexpected( la(), numericLiteral.length(), "expected hexadecimal digit" );
749                             }
750                         }
751                         else
752                         {
753                             while( Numbers.isOctalDigit(la()) )
754                             {
755                                 numericLiteral.append( consume() );
756                             }
757 
758                             if( Numbers.isDigit(la()) )
759                             {
760                                 unexpected( la(), numericLiteral.length(), "expected octal digit" );
761                             }
762                         }
763                     }
764 
765 
766                     //
767                     // Otherwise, it's in base 10, integer or decimal.
768 
769                     else
770                     {
771                         while( Numbers.isDigit(la()) )
772                         {
773                             numericLiteral.append( consume() );
774                         }
775 
776 
777                         //
778                         // Next, check for a decimal point
779 
780                         if( la() == '.' && Numbers.isDigit(la(2)) )
781                         {
782                             isDecimal = true;
783 
784                             numericLiteral.append( consume() );
785                             while( Numbers.isDigit(la()) )
786                             {
787                                 numericLiteral.append( consume() );
788                             }
789 
790                             //
791                             // Check for an exponent
792 
793                             if( (c = la()) == 'e' || c == 'E' )
794                             {
795                                 numericLiteral.append( consume() );
796 
797                                 if (la() == '+' || la() == '-')
798                                 {
799                                     numericLiteral.append(consume());
800                                 }
801 
802                                 if( Numbers.isDigit(la()) )
803                                 {
804                                     while( Numbers.isDigit(la()) )
805                                     {
806                                         numericLiteral.append( consume() );
807                                     }
808                                 }
809                                 else
810                                 {
811                                     unexpected( la(), numericLiteral.length(), "expected exponent" );
812                                 }
813                             }
814                         }
815                     }
816 
817 
818                     //
819                     // If there is a type suffix, include it.
820 
821                     if( Numbers.isNumericTypeSpecifier(la(), isDecimal) )
822                     {
823                         numericLiteral.append( consume() );
824                     }
825 
826 
827                     //
828                     // For good error reporting, make sure there is nothing invalid next.
829 
830                     if( Character.isJavaIdentifierPart(c = la()) )
831                     {
832                         unexpected( c, numericLiteral.length(), "expected end of numeric literal" );
833                     }
834 
835 
836                     //
837                     // Finally, create the token.
838 
839                     if( isDecimal )
840                     {
841                         token = Token.newDecimal( numericLiteral.toString(), getStartLine(), getStartColumn() );
842                     }
843                     else
844                     {
845                         token = Token.newInteger( numericLiteral.toString(), getStartLine(), getStartColumn() );
846                     }
847 
848                     break ROOT_SWITCH;
849                 }
850                 default :
851                 {
852                     mark();
853                     if (Character.isJavaIdentifierStart(c))
854                     {
855                         StringBuffer identifier = new StringBuffer();
856 
857                         IDENTIFIER_LOOP : while (true)
858                         {
859                             c = la();
860 
861                             if (Character.isJavaIdentifierPart(c))
862                             {
863                                 identifier.append(consume());
864                             }
865                             else
866                             {
867                                 break IDENTIFIER_LOOP;
868                             }
869                         }
870 
871                         String text = identifier.toString();
872                         token = Token.newKeyword( text, getStartLine(), getStartColumn() );
873 
874                         if (token == null)
875                         {
876                             token = Token.newIdentifier( text, getStartLine(), getStartColumn() );
877                         }
878                     }
879                     else
880                     {
881                         unexpected( c, 1 );
882                     }
883 
884                     break ROOT_SWITCH;
885                 }
886             }
887         }
888 
889         // System.out.println( "" + this + ".nextToken() returning [" + token + "]" );
890 
891         return token;
892     }
893 
894 }