with Ada.Finalization;
with Ada.Text_IO;
with Ada.Unchecked_Deallocation;

package Lexer is

   --====================================================================
   -- Author    Christoph Grein
   -- Version   3.3
   -- Date      3 October 1998
   --====================================================================
   -- An LL(1) grammar parser for a stream of characters representing an
   -- Ada or Java program.
   -- Attention: Not much effort has been invested to analyse the stream
   -- for legality.
   -- Tokens are correctly recognized as long as the input string is
   -- a legal program.  Otherwise some of the bad tokens might not be
   -- returned as such.
   --
   -- The lexer recognizes Ada or Java files upon initialisation.  If it
   -- cannot identify the language, the Unidentifyable_Language exception
   -- is raised.  Token names are appended with _A, _J, or _AJ depending
   -- on the language they are used for.
   --
   -- Note that reserved words and identifiers are separate lexical items
   -- for both Ada (ARM 2.2(1)) and Java (JRM 3.8).
   -- However, the Ada reserved words Access, Delta, Digits, Range are
   -- also used as attribute designators (ARM 4.1.4(3,5)).  The lexer
   -- always returns these items as reserved word tokens, because it is
   -- not considered the lexer's chore to do an analysis depending on the
   -- sequence of tokens:
   -- Attribute designators always follow the tick (ARM 4.1.4(2)).
   --
   -- An (Ada) operator occurring as an operator symbol (ARM 6.1(9), e.g.
   -- "=" (A, B)) is always returned as a string literal ("="), never as
   -- the corresponding token (Equal_A).
   -- Such a string literal can easily be identified as an operator in
   -- the following cases:
   -- - In a selected component (ARM 4.1.3(2)), it is preceded by a dot.
   -- - In a subprogram specification (ARM 6.1(4)) or a function call
   --   (ARM 6.4(3)), it is followed by an opening parenthesis.
   -- - In a formal_subprogram_declaration as a subprogram default
   --   (ARM 12.6(2)), it is preceded by the reserved word is.
   -- Only the occurrence in a generic instantiation as an explicit
   -- generic actual parameter (ARM 12.3(5)) necessitates a compilation-
   -- like analysis to discriminate between a simple string and an
   -- operator symbol, which is of course beyond the capabilities of
   -- lexical parsing.
   --
   -- Get_Token reads the next token from the input string.  End_Error is
   -- raised when the end of input has been reached.  For each token, the
   -- start and end position in the input stream is stored.  Furthermore,
   -- while the input is processed, lines and columns are counted.
   --
   -- Reset resets the parser to the given token's position in the input
   -- stream.  This token and the ones following will be returned on the
   -- next calls to Get_Token.  Reset_Error is raised if called with a
   -- token not from the input string.
   --====================================================================
   -- History
   -- Author Version   Date    Reason for change
   --  C.G.    0.0  25.05.1998 PDL
   --  C.G.    0.1  31.05.1998 Added bad token
   --  C.G.    1.0  17.06.1998 Final design (added reason of bad token)
   --  C.G.    2.0  01.07.1998 Make Token controlled to prevent storage
   --                          leak
   --  C.G.    2.1  06.07.1998 Size of Tab
   --  C.G.    3.0  28.07.1998 Also lex Java
   --  C.G.    3.1  05.08.1998 Added Documentation_Tag
   --  C.G.    3.2  22.09.1998 Replace
   --                            Ampersand_AJ    by Concatenate_A, And_J
   --                            Vertical_Bar_AJ by Alternative_A, Or_J
   --  C.G.    3.3  03.10.1998 Added functions Image and Tag_Pos for
   --                          documentation tags, Is_Operator
   --====================================================================

   -- All kinds of token the lexer can return.  The order of the literals
   -- is significant: the subtypes declared below are ranges over it.
   type Token_Name is
     (-- Ada reserved words ARM 2.9 (2)
      -- Java keywords      JRM 3.9
      Abort_A, Abs_A, Abstract_AJ, Accept_A, Access_A, Aliased_A, All_A,
      And_A, Array_A, At_A,
      Begin_A, Body_A, Boolean_J, Break_J, Byte_J,
      Case_AJ, Catch_J, Char_J, Class_J, Const_J, Constant_A, Continue_J,
      Declare_A, Default_J, Delay_A, Delta_A, Digits_A, Do_AJ, Double_J,
      Else_AJ, Elsif_A, End_A, Entry_A, Exception_A, Exit_A, Extends_J,
      Final_J, Finally_J, Float_J, For_AJ, Function_A,
      Generic_A, Goto_AJ,
      If_AJ, Implements_J, Import_J, In_A, InstanceOf_J, Int_J,
      Interface_J, Is_A,
      Limited_A, Long_J, Loop_A,
      Mod_A,
      Native_J, New_AJ, Not_A, Null_A,
      Of_A, Or_A, Others_A, Out_A,
      Package_AJ, Pragma_A, Private_AJ, Procedure_A, Protected_AJ,
      Public_J,
      Raise_A, Range_A, Record_A, Rem_A, Renames_A, Requeue_A, Return_AJ,
      Reverse_A,
      Select_A, Separate_A, Short_J, Static_J, Subtype_A, Super_J,
      Switch_J, Synchronized_J,
      Tagged_A, Task_A, Terminate_A, Then_A, This_J, Throw_J, Throws_J,
      Transient_J, Try_J, Type_A,
      Until_A, Use_A,
      Void_J, Volatile_J,
      When_A, While_AJ, With_A,
      Xor_A,

      -- Ada delimiters ARM 2.2 (9)
      -- & ' ( ) * + , - . / : ; < = > |
      -- Ada compound delimiters ARM 2.2 (14)
      -- => .. ** := /= >= <= << >> <>
      -- Java separators JRM 3.11
      -- ( ) { } [ ] ; , .
      -- Java operators JRM 3.12
      -- = > < ! ~ ? :
      -- == <= >= != && || ++ --
      -- + - * / & | ^ % << >> >>>
      -- += -= *= /= &= |= ^= %= <<= >>= >>>=
      Colon_AJ,                    -- :
      Comma_AJ,                    -- ,
      Dot_AJ,                      -- .
      Semicolon_AJ,                -- ;
      Tick_A,                      -- '
      LeftBrace_J,                 -- {
      RightBrace_J,                -- }
      LeftBracket_J,               -- [
      RightBracket_J,              -- ]
      Left_Parenthesis_AJ,         -- (
      Right_Parenthesis_AJ,        -- )
      Concatenate_A,               -- &  (Ada)
      And_J,                       -- &  (Java)
      Alternative_A,               -- |  (Ada)
      Or_J,                        -- |  (Java)
      Assignment_J,                -- =  (Java)
      Conditional_J,               -- ?
      Equal_A,                     -- =  (Ada)
      Greater_Than_AJ,             -- >
      Less_Than_AJ,                -- <
      Complement_J,                -- ~
      Not_J,                       -- !
      Xor_J,                       -- ^
      Plus_AJ,                     -- +
      Minus_AJ,                    -- -
      Times_AJ,                    -- *
      Divide_AJ,                   -- /
      Remainder_J,                 -- %
      Arrow_A,                     -- =>
      Assignment_A,                -- :=
      Double_Dot_A,                -- ..
      Exponentiate_A,              -- **
      Equal_J,                     -- ==
      Not_Equal_A,                 -- /=  (Ada)
      NotEqual_J,                  -- !=
      Greater_Equal_AJ,            -- >=
      Less_Equal_AJ,               -- <=
      Left_Label_Bracket_A,        -- <<  (Ada)
      Right_Label_Bracket_A,       -- >>  (Ada)
      Box_A,                       -- <>
      Increment_J,                 -- ++
      Decrement_J,                 -- --
      LeftShift_J,                 -- <<  (Java)
      RightShift_J,                -- >>  (Java)
      UnsignedRightShift_J,        -- >>>
      ShortCutAnd_J,               -- &&
      ShortCutOr_J,                -- ||
      PlusAssign_J,                -- +=
      MinusAssign_J,               -- -=
      TimesAssign_J,               -- *=
      DivideAssign_J,              -- /=  (Java)
      RemainderAssign_J,           -- %=
      AndAssign_J,                 -- &=
      OrAssign_J,                  -- |=
      XorAssign_J,                 -- ^=
      LeftShiftAssign_J,           -- <<=
      RightShiftAssign_J,          -- >>=
      UnsignedRightShiftAssign_J,  -- >>>=

      -- Ada (ARM 2.4 .. 2.6) and Java (JRM 3.10) literals (all Java reals
      -- may use lazy forms, i.e. whole or decimal part may be missing)
      Null_J,
      False_J,
      True_J,
      Integer_AJ,                  -- 1, Ada 1E+10
      Based_Integer_AJ,            -- 13#C#E+10, Java 07 (octal)
                                   --            0xF (hexadecimal)
      LongInteger_J,               -- 1L
      BasedLongInteger_J,          -- 07L, 0xFL
      Real_AJ,                     -- 1.0E+10, Java 1E+10
      Based_Real_A,                -- Ada 13#C.B#E+5
      FloatNumber_J,               -- 1.0E+10F
      DoubleNumber_J,              -- 1.0E+10D
      Character_AJ,
      String_AJ,

      -- Ada and Java other tokens
      Identifier_AJ,
      Comment_A,                   -- -- to end of line
      EndOfLineComment_J,          -- // to end of line
      TraditionalComment_J,        -- /* on one line */
      CommentHead_J,               -- /* to end of line
      CommentBody_J,               --    anything in between
      CommentTail_J,               --    to */
      Documentation_J,             -- /** on one line */
      DocumentationHead_J,         -- /** to end of line
      DocumentationBody_J,         --     anything in between
      DocumentationTail_J,         --     to */

      -- Ada and Java bad token (syntax error)
      Bad_Token_AJ);

   -- Classification of token names as contiguous ranges of Token_Name.
   subtype Reserved_Word is Token_Name
     range Abort_A .. Xor_A;
   subtype Delimiter     is Token_Name
     range Colon_AJ .. UnsignedRightShiftAssign_J;
   subtype Literal       is Token_Name
     range Null_J .. String_AJ;
   subtype Number        is Literal
     range Integer_AJ .. DoubleNumber_J;
   subtype Whole_Number  is Number
     range Integer_AJ .. BasedLongInteger_J;
   subtype Real_Number   is Number
     range Real_AJ .. DoubleNumber_J;
   subtype Comment       is Token_Name
     range Comment_A .. DocumentationTail_J;
   subtype Documentation is Comment
     range Documentation_J .. DocumentationTail_J;

   -- A lexical item as delivered by Get_Token.
   type Token is private;

   -- Kind of the given token.
   function Name (Item : Token) return Token_Name;
   pragma Inline (Name);

   -- Image of the given token as a string.
   function Image (Item : Token) return String;

   -- Operators cannot be made a subtype of Token_Name.  Thus a function
   -- is specified.
   -- Ada:  ARM 4.5(2)    and or xor
   --              (3)    = /= < <= > >=
   --              (4,5)  + - &
   --              (6)    * / mod rem
   --              (7)    ** abs not
   --       (not in operator symbol form)
   -- Java: JRM 3.12 (see above)
   function Is_Operator (Item : Token) return Boolean;

   -- Only for based numbers.
   subtype Number_Base is Ada.Text_IO.Number_Base;

   function Base (Item : Token) return Number_Base;
   pragma Inline (Base);

   -- Only for Java documentation comments.
   type Documentation_Tag is
     (No_Tag,
      See_Tag,
      Author_Tag,
      Version_Tag,
      Param_Tag,
      Return_Tag,
      Exception_Tag);

   -- Every tag except No_Tag.
   subtype Proper_Tag is Documentation_Tag
     range See_Tag .. Documentation_Tag'Last;

   function Tag     (Item : Token) return Documentation_Tag;
   function Tag_Pos (Item : Token) return Natural;  -- position within image
   pragma Inline (Tag);
   pragma Inline (Tag_Pos);

   -- Image of a documentation tag as a string.
   function Image (Tag : Proper_Tag) return String;

   -- Only for bad tokens (if there are errors, it depends on the context
   -- when the bad token's end is assumed and which error is reported).
   -- Illegal_Literal is used for any error in identifiers or numbers that
   -- is not covered by more explicit reports.
   type Token_Error is
     (Non_Language_Character,  -- e.g. { for Ada, ` for Java
      Illegal_Literal,         -- e.g. non-graphic character
      Illegal_Underline,       -- for identifiers and numbers
      Illegal_Base,            -- for numbers
      Missing_Base_Quote,
      Illegal_Extended_Digit,
      Missing_String_Quote);   -- string terminated by EOL

   function Error (Item : Token) return Token_Error;
   pragma Inline (Error);

   -- Token position within string and file
   function First (Item : Token) return Positive;
   function Last  (Item : Token) return Positive;
   function Line  (Item : Token) return Positive;
   function Col   (Item : Token) return Positive;
   pragma Inline (First);
   pragma Inline (Last);
   pragma Inline (Line);
   pragma Inline (Col);

   -- Input string ------------------------------------------------------

   type String_Pointer is access all String;

   procedure Free is
     new Ada.Unchecked_Deallocation (String, String_Pointer);

   -- Hands the input string to the lexer.  Unidentifyable_Language is
   -- raised if the language of the input (Ada or Java) cannot be
   -- identified.
   procedure Initialize (Input       : in String_Pointer;
                         Size_of_Tab : in Positive);

   Unidentifyable_Language : exception;

   -- Current position within Input
   function Line return Positive;
   function Col  return Positive;
   pragma Inline (Line);
   pragma Inline (Col);

   function End_of_Input return Boolean;
   pragma Inline (End_of_Input);

   -- Token parser ------------------------------------------------------

   -- Reads the next token from the input string.  End_Error is raised
   -- when the end of input has been reached.
   function Get_Token return Token;

   -- Resets the parser to the position of to_Token in the input stream.
   -- Reset_Error is raised if to_Token is not from the input string.
   procedure Reset (to_Token : in Token);

   End_Error   : exception;
   Reset_Error : exception;

private

   -- Data of a token; which components exist depends on the token name.
   type Token_Core (Name : Token_Name := Comment_A) is record
      -- location of token in input string and file
      Line  : Positive;
      Col   : Positive;
      First : Positive;
      Last  : Positive;
      case Name is
         when Number'First .. Bad_Token_AJ =>
            Image : String_Pointer;  -- illegal token if null
            case Name is
               when Based_Real_A | Based_Integer_AJ | BasedLongInteger_J =>
                  Base : Number_Base;
               when Documentation =>
                  Tag : Documentation_Tag;
                  Pos : Natural;
               when Bad_Token_AJ =>
                  Error : Token_Error;
               when others =>
                  null;
            end case;
         when others =>
            null;
      end case;
   end record;

   -- Token is controlled to prevent storage leak (see history, 2.0).
   type Token is new Ada.Finalization.Controlled with record
      Core : Token_Core;
   end record;

   procedure Adjust   (Object : in out Token);
   procedure Finalize (Object : in out Token);

end Lexer;