1 package org.codehaus.groovy.syntax.lexer;
2
3 import org.codehaus.groovy.syntax.ReadException;
4 import org.codehaus.groovy.syntax.Numbers;
5 import org.codehaus.groovy.syntax.Types;
6 import org.codehaus.groovy.syntax.Token;
7
8 /***
9 * The core code used in lexing Groovy.
10 *
11 * @author Bob Mcwhirter
12 * @author James Strachan
13 * @author John Wilson
14 * @author Chris Poirier
15 */
16
17 public class GroovyLexerBase extends LexerBase
18 {
19
20 protected StringLexer stringLexer = new StringLexer();
21 protected GStringLexer gstringLexer = new GStringLexer();
22
23
24 /***
25 * Finds and returns (and consumes) the next token from the underlying stream.
26 * Returns null when out of tokens.
27 */
28
29 public Token nextToken() throws ReadException, LexerException
30 {
31
32
33 Token token = null;
34 OUTER_LOOP : while (token == null)
35 {
36
37
38
39
40 if( delegate != null )
41 {
42 token = delegate.nextToken();
43
44 if( token == null )
45 {
46 undelegate();
47 }
48 else
49 {
50 break OUTER_LOOP;
51 }
52 }
53
54
55
56
57
58 char c = la();
59
60 ROOT_SWITCH : switch (c)
61 {
62 case (CharStream.EOS) :
63 {
64 break OUTER_LOOP;
65 }
66 case (' ') :
67 case ('\t') :
68 {
69 consume();
70 token = null;
71 break ROOT_SWITCH;
72 }
73 case ('\r') :
74 case ('\n') :
75 {
76 mark();
77 token = tokenizeEOL();
78 break ROOT_SWITCH;
79 }
80 case ('{') :
81 {
82 mark();
83 consume();
84 token = symbol( Types.LEFT_CURLY_BRACE );
85 break ROOT_SWITCH;
86 }
87 case ('}') :
88 {
89 mark();
90 consume();
91 token = symbol( Types.RIGHT_CURLY_BRACE );
92 break ROOT_SWITCH;
93 }
94 case ('[') :
95 {
96 mark();
97 consume();
98 token = symbol( Types.LEFT_SQUARE_BRACKET );
99 break ROOT_SWITCH;
100 }
101 case (']') :
102 {
103 mark();
104 consume();
105 token = symbol( Types.RIGHT_SQUARE_BRACKET );
106 break ROOT_SWITCH;
107 }
108 case ('(') :
109 {
110 mark();
111 consume();
112 token = symbol( Types.LEFT_PARENTHESIS );
113 break ROOT_SWITCH;
114 }
115 case (')') :
116 {
117 mark();
118 consume();
119 token = symbol( Types.RIGHT_PARENTHESIS );
120 break ROOT_SWITCH;
121 }
122 case ('#') :
123 {
124 consume();
125
126 token = symbol( Types.NEWLINE, -1 );
127
128 CONSUME_LOOP : while( true )
129 {
130 switch (c = la())
131 {
132 case ('\r') :
133 case ('\n') :
134 {
135 readEOL();
136 break CONSUME_LOOP;
137 }
138 case CharStream.EOS :
139 {
140 break CONSUME_LOOP;
141 }
142 default :
143 {
144 consume();
145 }
146 }
147 }
148 break ROOT_SWITCH;
149 }
150 case ('/') :
151 {
152 mark();
153 consume();
154
155 c = la();
156
157 MULTICHAR_SWITCH : switch (c)
158 {
159 case ('=') :
160 {
161 consume();
162 token = symbol( Types.DIVIDE_EQUAL );
163 break MULTICHAR_SWITCH;
164 }
165 case ('/') :
166 {
167 consume();
168 token = symbol( Types.NEWLINE, -2 );
169
170 CONSUME_LOOP : while (true)
171 {
172 switch (c = la())
173 {
174 case ('\r') :
175 case ('\n') :
176 {
177 readEOL();
178 break CONSUME_LOOP;
179 }
180 case CharStream.EOS :
181 {
182 break CONSUME_LOOP;
183 }
184 default :
185 {
186 consume();
187 }
188 }
189 }
190 break MULTICHAR_SWITCH;
191 }
192 case ('*') :
193 {
194 CONSUME_LOOP : while (true)
195 {
196 CONSUME_SWITCH : switch (c = la())
197 {
198 case ('*') :
199 {
200 consume();
201 if (la() == '/')
202 {
203 consume();
204 break CONSUME_LOOP;
205 }
206 break CONSUME_SWITCH;
207 }
208 case ('\r') :
209 case ('\n') :
210 {
211 readEOL();
212 break CONSUME_SWITCH;
213 }
214 case CharStream.EOS :
215 {
216 break CONSUME_LOOP;
217 }
218 default :
219 {
220 consume();
221 }
222 }
223 }
224 token = null;
225 break MULTICHAR_SWITCH;
226 }
227 default :
228 {
229 token = symbol( Types.DIVIDE );
230 break MULTICHAR_SWITCH;
231 }
232 }
233 break ROOT_SWITCH;
234 }
235 case ('%') :
236 {
237 mark();
238 consume();
239
240 c = la();
241
242 MULTICHAR_SWITCH : switch (c)
243 {
244 case ('=') :
245 {
246 consume();
247 token = symbol( Types.MOD_EQUAL );
248 break MULTICHAR_SWITCH;
249 }
250 default :
251 {
252 token = symbol( Types.MOD );
253 break MULTICHAR_SWITCH;
254 }
255 }
256 break ROOT_SWITCH;
257 }
258 case ('//') :
259 {
260 mark();
261 consume();
262
263 c = la();
264
265 MULTICHAR_SWITCH : switch (c)
266 {
267 case ('=') :
268 {
269 consume();
270 token = symbol( Types.INTDIV_EQUAL );
271 break MULTICHAR_SWITCH;
272 }
273 default :
274 {
275 token = symbol( Types.INTDIV );
276 break MULTICHAR_SWITCH;
277 }
278 }
279 break ROOT_SWITCH;
280 }
281 case ('~') :
282 {
283 mark();
284 consume();
285
286 token = symbol( Types.REGEX_PATTERN );
287 break ROOT_SWITCH;
288 }
289 case ('!') :
290 {
291 mark();
292 consume();
293
294 c = la();
295
296 MULTICHAR_SWITCH : switch (c)
297 {
298 case ('=') :
299 {
300 consume();
301 if( la() == '=' )
302 {
303 consume();
304 token = symbol( Types.COMPARE_NOT_IDENTICAL );
305 }
306 else
307 {
308 token = symbol( Types.COMPARE_NOT_EQUAL );
309 }
310 break MULTICHAR_SWITCH;
311 }
312 default :
313 {
314 token = symbol( Types.NOT );
315 break MULTICHAR_SWITCH;
316 }
317 }
318 break ROOT_SWITCH;
319 }
320 case ('=') :
321 {
322 mark();
323 consume();
324
325 c = la();
326
327 MULTICHAR_SWITCH : switch (c)
328 {
329 case ('=') :
330 {
331 consume();
332 c = la();
333
334 switch (c)
335 {
336 case '=' :
337 {
338 consume();
339 token = symbol( Types.COMPARE_IDENTICAL );
340 break;
341 }
342 case '~' :
343 {
344 consume();
345 token = symbol( Types.MATCH_REGEX );
346 break;
347 }
348 default :
349 {
350 token = symbol( Types.COMPARE_EQUAL );
351 }
352 }
353 break MULTICHAR_SWITCH;
354 }
355 case '~' :
356 {
357 consume();
358 token = symbol( Types.FIND_REGEX );
359 break MULTICHAR_SWITCH;
360 }
361 default :
362 {
363 token = symbol( Types.EQUAL );
364 break MULTICHAR_SWITCH;
365 }
366 }
367 break ROOT_SWITCH;
368 }
369 case ('&') :
370 {
371 mark();
372 consume();
373
374 c = la();
375
376 MULTICHAR_SWITCH : switch (c)
377 {
378 case ('&') :
379 {
380 consume();
381
382 if( la() == '=' )
383 {
384 consume();
385 token = symbol( Types.LOGICAL_AND_EQUAL );
386 }
387 else
388 {
389 token = symbol( Types.LOGICAL_AND );
390 }
391
392 break MULTICHAR_SWITCH;
393 }
394 default :
395 {
396 unexpected( c, new char[] { '&' }, 1 );
397 }
398 }
399 break ROOT_SWITCH;
400 }
401 case ('|') :
402 {
403 mark();
404 consume();
405 c = la();
406
407 MULTICHAR_SWITCH : switch (c)
408 {
409 case ('|') :
410 {
411 consume();
412
413 if( la() == '=' )
414 {
415 consume();
416 token = symbol( Types.LOGICAL_OR_EQUAL );
417 }
418 else
419 {
420 token = symbol( Types.LOGICAL_OR );
421 }
422
423 break MULTICHAR_SWITCH;
424 }
425 default :
426 {
427 token = symbol( Types.PIPE );
428 break MULTICHAR_SWITCH;
429 }
430 }
431 break ROOT_SWITCH;
432 }
433 case ('+') :
434 {
435 mark();
436 consume();
437
438 c = la();
439
440 MULTICHAR_SWITCH : switch (c)
441 {
442 case ('+') :
443 {
444 consume();
445 token = symbol( Types.PLUS_PLUS );
446 break MULTICHAR_SWITCH;
447 }
448 case ('=') :
449 {
450 consume();
451 token = symbol( Types.PLUS_EQUAL );
452 break MULTICHAR_SWITCH;
453 }
454 default :
455 {
456 token = symbol( Types.PLUS );
457 break MULTICHAR_SWITCH;
458 }
459 }
460 break ROOT_SWITCH;
461 }
462 case ('-') :
463 {
464 mark();
465 consume();
466
467 c = la();
468
469 MULTICHAR_SWITCH : switch (c)
470 {
471 case ('-') :
472 {
473 consume();
474 token = symbol( Types.MINUS_MINUS );
475 break MULTICHAR_SWITCH;
476 }
477 case ('=') :
478 {
479 consume();
480 token = symbol( Types.MINUS_EQUAL );
481 break MULTICHAR_SWITCH;
482 }
483 case ('>') :
484 {
485 consume();
486 token = symbol( Types.NAVIGATE );
487 break MULTICHAR_SWITCH;
488 }
489 default :
490 {
491 token = symbol( Types.MINUS );
492 break MULTICHAR_SWITCH;
493 }
494 }
495 break ROOT_SWITCH;
496 }
497 case ('*') :
498 {
499 mark();
500 consume();
501
502 c = la();
503
504 MULTICHAR_SWITCH : switch (c)
505 {
506 case ('=') :
507 {
508 consume();
509 token = symbol( Types.MULTIPLY_EQUAL );
510 break MULTICHAR_SWITCH;
511 }
512 default :
513 {
514 token = symbol( Types.MULTIPLY );
515 break MULTICHAR_SWITCH;
516 }
517 }
518 break ROOT_SWITCH;
519 }
520 case (':') :
521 {
522 mark();
523 consume();
524
525 token = symbol( Types.COLON );
526 break ROOT_SWITCH;
527 }
528 case (',') :
529 {
530 mark();
531 consume();
532 token = symbol( Types.COMMA );
533 break ROOT_SWITCH;
534 }
535 case (';') :
536 {
537 mark();
538 consume();
539 token = symbol( Types.SEMICOLON );
540 break ROOT_SWITCH;
541 }
542 case ('?') :
543 {
544 mark();
545 consume();
546 token = symbol( Types.QUESTION );
547 break ROOT_SWITCH;
548 }
549 case ('<') :
550 {
551 mark();
552 consume();
553
554 c = la();
555
556 MULTICHAR_SWITCH : switch (c)
557 {
558 case ('=') :
559 {
560 consume();
561 c = la();
562 if (c == '>')
563 {
564 consume();
565 token = symbol( Types.COMPARE_TO );
566 }
567 else
568 {
569 token = symbol( Types.COMPARE_LESS_THAN_EQUAL );
570 }
571 break MULTICHAR_SWITCH;
572 }
573 case ('<') :
574 {
575 consume();
576 c = la();
577
578
579
580
581
582
583
584 if (c == '<')
585 {
586 consume();
587
588 StringBuffer marker = new StringBuffer();
589 while( (c = la()) != '\n' && c != '\r' && c != CharStream.EOS )
590 {
591 marker.append( consume() );
592 }
593
594 readEOL();
595
596 Lexer child = new HereDocLexer( marker.toString() );
597 delegate( child );
598
599 gstringLexer.reset();
600 child.delegate( gstringLexer );
601
602 break ROOT_SWITCH;
603 }
604 else
605 {
606 token = symbol( Types.LEFT_SHIFT );
607 break ROOT_SWITCH;
608 }
609 }
610 default :
611 {
612 token = symbol( Types.COMPARE_LESS_THAN );
613 break MULTICHAR_SWITCH;
614 }
615 }
616 break ROOT_SWITCH;
617 }
618 case ('>') :
619 {
620 mark();
621 consume();
622
623 c = la();
624
625 MULTICHAR_SWITCH : switch (c)
626 {
627 case ('=') :
628 {
629 consume();
630 token = symbol( Types.COMPARE_GREATER_THAN_EQUAL );
631 break MULTICHAR_SWITCH;
632 }
633 case ('>') :
634 {
635 consume();
636 if( la() == '>' )
637 {
638 consume();
639 token = symbol( Types.RIGHT_SHIFT_UNSIGNED );
640 }
641 else
642 {
643 token = symbol( Types.RIGHT_SHIFT );
644 }
645 break MULTICHAR_SWITCH;
646 }
647 default :
648 {
649 token = symbol( Types.COMPARE_GREATER_THAN );
650 break MULTICHAR_SWITCH;
651 }
652 }
653 break ROOT_SWITCH;
654 }
655 case ('\'') :
656 {
657 mark();
658
659 stringLexer.reset();
660 stringLexer.allowGStrings(false);
661 delegate( stringLexer );
662
663 break ROOT_SWITCH;
664 }
665 case ('"') :
666 {
667 mark();
668
669 stringLexer.reset();
670 stringLexer.allowGStrings(true);
671 delegate( stringLexer );
672
673 gstringLexer.reset();
674 stringLexer.delegate( gstringLexer );
675
676 break ROOT_SWITCH;
677 }
678 case ('0') :
679 case ('1') :
680 case ('2') :
681 case ('3') :
682 case ('4') :
683 case ('5') :
684 case ('6') :
685 case ('7') :
686 case ('8') :
687 case ('9') :
688 case ('.') :
689 {
690 mark();
691
692
693
694
695
696 if( c == '.' && !Numbers.isDigit(la(2)) )
697 {
698 consume();
699 if( la() == '.' )
700 {
701 consume();
702 if( la() == '.' )
703 {
704 consume();
705 token = symbol( Types.DOT_DOT_DOT );
706 }
707 else
708 {
709 token = symbol( Types.DOT_DOT );
710 }
711 }
712 else
713 {
714 token = symbol( Types.DOT );
715 }
716 break ROOT_SWITCH;
717 }
718
719
720
721
722
723 StringBuffer numericLiteral = new StringBuffer();
724 boolean isDecimal = false;
725
726
727
728
729
730
731 char c2 = la(2);
732 if( c == '0' && (c2 == 'X' || c2 == 'x' || Numbers.isDigit(c2)) )
733 {
734 numericLiteral.append( consume() );
735
736 if( (c = la()) == 'X' || c == 'x' )
737 {
738 numericLiteral.append( consume() );
739 if( Numbers.isHexDigit(la()) )
740 {
741 while( Numbers.isHexDigit(la()) )
742 {
743 numericLiteral.append( consume() );
744 }
745 }
746 else
747 {
748 unexpected( la(), numericLiteral.length(), "expected hexadecimal digit" );
749 }
750 }
751 else
752 {
753 while( Numbers.isOctalDigit(la()) )
754 {
755 numericLiteral.append( consume() );
756 }
757
758 if( Numbers.isDigit(la()) )
759 {
760 unexpected( la(), numericLiteral.length(), "expected octal digit" );
761 }
762 }
763 }
764
765
766
767
768
769 else
770 {
771 while( Numbers.isDigit(la()) )
772 {
773 numericLiteral.append( consume() );
774 }
775
776
777
778
779
780 if( la() == '.' && Numbers.isDigit(la(2)) )
781 {
782 isDecimal = true;
783
784 numericLiteral.append( consume() );
785 while( Numbers.isDigit(la()) )
786 {
787 numericLiteral.append( consume() );
788 }
789
790
791
792
793 if( (c = la()) == 'e' || c == 'E' )
794 {
795 numericLiteral.append( consume() );
796
797 if (la() == '+' || la() == '-')
798 {
799 numericLiteral.append(consume());
800 }
801
802 if( Numbers.isDigit(la()) )
803 {
804 while( Numbers.isDigit(la()) )
805 {
806 numericLiteral.append( consume() );
807 }
808 }
809 else
810 {
811 unexpected( la(), numericLiteral.length(), "expected exponent" );
812 }
813 }
814 }
815 }
816
817
818
819
820
821 if( Numbers.isNumericTypeSpecifier(la(), isDecimal) )
822 {
823 numericLiteral.append( consume() );
824 }
825
826
827
828
829
830 if( Character.isJavaIdentifierPart(c = la()) )
831 {
832 unexpected( c, numericLiteral.length(), "expected end of numeric literal" );
833 }
834
835
836
837
838
839 if( isDecimal )
840 {
841 token = Token.newDecimal( numericLiteral.toString(), getStartLine(), getStartColumn() );
842 }
843 else
844 {
845 token = Token.newInteger( numericLiteral.toString(), getStartLine(), getStartColumn() );
846 }
847
848 break ROOT_SWITCH;
849 }
850 default :
851 {
852 mark();
853 if (Character.isJavaIdentifierStart(c))
854 {
855 StringBuffer identifier = new StringBuffer();
856
857 IDENTIFIER_LOOP : while (true)
858 {
859 c = la();
860
861 if (Character.isJavaIdentifierPart(c))
862 {
863 identifier.append(consume());
864 }
865 else
866 {
867 break IDENTIFIER_LOOP;
868 }
869 }
870
871 String text = identifier.toString();
872 token = Token.newKeyword( text, getStartLine(), getStartColumn() );
873
874 if (token == null)
875 {
876 token = Token.newIdentifier( text, getStartLine(), getStartColumn() );
877 }
878 }
879 else
880 {
881 unexpected( c, 1 );
882 }
883
884 break ROOT_SWITCH;
885 }
886 }
887 }
888
889
890
891 return token;
892 }
893
894 }