with Ada.Text_IO;
with Ada.Finalization;
with Ada.Unchecked_Deallocation;
package Lexer is
--====================================================================
-- Author Christoph Grein
-- Version 3.3
-- Date 3 October 1998
--====================================================================
-- An LL(1) grammar parser for a stream of characters representing an
-- Ada or Java program.
-- Attention: Not much effort has been invested to analyse the stream
-- for legality.
-- Tokens are correctly recognized as long as the input string is
-- a legal program. Otherwise some of the bad tokens might not be
-- returned as such.
--
-- The lexer recognizes Ada or Java files upon initialisation. If it
-- cannot identify the language, the Unidentifyable_Language exception
-- is raised. Token names are appended with _A, _J, or _AJ depending
-- on the language they are used for.
--
-- Note that reserved words and identifiers are separate lexical items
-- for both Ada (ARM 2.2(1)) and Java (JRM 3.8).
-- However the Ada-reserved words Access, Delta, Digits, Range are
-- also used as attribute designators (ARM 4.1.4(3,5)). The lexer
-- always returns these items as reserved word tokens, because it is
-- not considered the lexer's chore to do an analysis depending on the
-- sequence of tokens:
-- Attribute designators always follow the tick (ARM 4.1.4(2)).
--
-- An (Ada) operator occurring as an operator symbol (ARM 6.1(9), e.g.
-- "=" (A, B)) is always returned as a string literal ("="), never as
-- the corresponding token (Equal_A).
-- Such a string literal can easily be identified as an operator in
-- the following cases:
-- - In a selected component (ARM 4.1.3(2)), it is preceded by a dot.
-- - In a subprogram specification (ARM 6.1(4)) or a function call
-- (ARM 6.4(3)), it is followed by an opening parenthesis.
-- - In a formal_subprogram_declaration as a subprogram default
-- (ARM 12.6(2)), it is preceded by the reserved word is.
-- Only the occurrence in a generic instantiation as an explicit
-- generic actual parameter (ARM 12.3(5)) necessitates a compilation-
-- like analysis to discriminate between a simple string and an
-- operator symbol, which is of course out of the capabilities of
-- lexical parsing.
--
-- Get_Token reads the next token from the input string. End_Error is
-- raised when the end of input has been reached. For each token, the
-- start and end position in the input stream is stored. Furthermore,
-- while the input is processed, lines and columns are counted.
--
-- Reset resets the parser to the given token's position in the input
-- stream. This token and the ones following will be returned on the
-- next calls to Get_Token. Reset_Error is raised if called with a
-- token not from the input string.
--====================================================================
-- History
-- Author Version Date Reason for change
-- C.G. 0.0 25.05.1998 PDL
-- C.G. 0.1 31.05.1998 Added bad token
-- C.G. 1.0 17.06.1998 Final design (added reason of bad token)
-- C.G. 2.0 01.07.1998 Make Token controlled to prevent storage
-- leak
-- C.G. 2.1 06.07.1998 Size of Tab
-- C.G. 3.0 28.07.1998 Also lex Java
-- C.G. 3.1 05.08.1998 Added Documentation_Tag
-- C.G. 3.2 22.09.1998 Replace
-- Ampersand_AJ by Concatenate_A, And_J
-- Vertical_Bar_AJ by Alternative_A, Or_J
-- C.G. 3.3 03.10.1998 Added functions Image and Tag_Pos for
-- documentation tags, Is_Operator
--====================================================================
-- Every kind of token the lexer can return.
-- The suffix tells which language a literal belongs to: _A is Ada only,
-- _J is Java only, _AJ is shared by both languages.
-- The declaration order is significant: the subtypes Reserved_Word,
-- Delimiter, Literal, Number, Comment, ... and the variant part of
-- Token_Core are all defined as contiguous ranges of this enumeration,
-- so a new literal has to be inserted into the matching group.
type Token_Name is
(-- Ada reserved words ARM 2.9 (2)
-- Java keywords JRM 3.9
Abort_A, Abs_A, Abstract_AJ, Accept_A, Access_A, Aliased_A, All_A, And_A, Array_A, At_A,
Begin_A, Body_A, Boolean_J, Break_J, Byte_J,
Case_AJ, Catch_J, Char_J, Class_J, Const_J, Constant_A, Continue_J,
Declare_A, Default_J, Delay_A, Delta_A, Digits_A, Do_AJ, Double_J,
Else_AJ, Elsif_A, End_A, Entry_A, Exception_A, Exit_A, Extends_J,
Final_J, Finally_J, Float_J, For_AJ, Function_A,
Generic_A, Goto_AJ,
If_AJ, Implements_J, Import_J, In_A, InstanceOf_J, Int_J, Interface_J, Is_A,
Limited_A, Long_J, Loop_A,
Mod_A,
Native_J, New_AJ, Not_A, Null_A,
Of_A, Or_A, Others_A, Out_A,
Package_AJ, Pragma_A, Private_AJ, Procedure_A, Protected_AJ, Public_J,
Raise_A, Range_A, Record_A, Rem_A, Renames_A, Requeue_A, Return_AJ, Reverse_A,
Select_A, Separate_A, Short_J, Static_J, Subtype_A, Super_J, Switch_J, Synchronized_J,
Tagged_A, Task_A, Terminate_A, Then_A, This_J, Throw_J, Throws_J, Transient_J, Try_J, Type_A,
Until_A, Use_A,
Void_J, Volatile_J,
When_A, While_AJ, With_A,
Xor_A,
-- Ada delimiters ARM 2.2 (9)
-- & ' ( ) * + , - . / : ; < = > |
-- Ada compound delimiters ARM 2.2 (14)
-- => .. ** := /= >= <= << >> <>
-- Java separators JRM 3.11
-- ( ) { } [ ] ; , .
-- Java operators JRM 3.12
-- = > < ! ~ ? :
-- == <= >= != && || ++ --
-- + - * / & | ^ % << >> >>>
-- += -= *= /= &= |= ^= %= <<= >>= >>>=
Colon_AJ, Comma_AJ, Dot_AJ, Semicolon_AJ, Tick_A, -- : , . ; '
LeftBrace_J, RightBrace_J, -- { }
LeftBracket_J, RightBracket_J, -- [ ]
Left_Parenthesis_AJ, Right_Parenthesis_AJ, -- ( )
Concatenate_A, And_J, -- &
Alternative_A, Or_J, -- |
Assignment_J, Conditional_J, -- = ?
Equal_A, Greater_Than_AJ, Less_Than_AJ, -- = > <
Complement_J, Not_J, Xor_J, -- ~ ! ^
Plus_AJ, Minus_AJ, Times_AJ, Divide_AJ, Remainder_J, -- + - * / %
Arrow_A, Assignment_A, Double_Dot_A, Exponentiate_A, -- => := .. **
Equal_J, Not_Equal_A, NotEqual_J, -- == /= !=
Greater_Equal_AJ, Less_Equal_AJ, -- >= <=
Left_Label_Bracket_A, Right_Label_Bracket_A, Box_A, -- << >> <>
Increment_J, Decrement_J, -- ++ --
LeftShift_J, RightShift_J, UnsignedRightShift_J, -- << >> >>>
ShortCutAnd_J, ShortCutOr_J, -- && ||
PlusAssign_J, MinusAssign_J, -- += -=
TimesAssign_J, DivideAssign_J, RemainderAssign_J, -- *= /= %=
AndAssign_J, OrAssign_J, XorAssign_J, -- &= |= ^=
LeftShiftAssign_J, RightShiftAssign_J, -- <<= >>=
UnsignedRightShiftAssign_J, -- >>>=
-- Ada (ARM 2.4 .. 2.6) and Java (JRM 3.10) literals (all Java reals
-- may use lazy forms, i.e. whole or decimal part may be missing)
Null_J, False_J, True_J,
Integer_AJ, -- 1, Ada 1E+10
Based_Integer_AJ, -- 13#C#E+10, Java 07 (octal) 0xF (hexadecimal)
LongInteger_J, -- 1L
BasedLongInteger_J, -- 07L, 0xFL
Real_AJ, -- 1.0E+10, Java 1E+10
Based_Real_A, -- Ada 13#C.B#E+5
FloatNumber_J, -- 1.0E+10F
DoubleNumber_J, -- 1.0E+10D
Character_AJ, String_AJ,
-- Ada and Java other tokens
Identifier_AJ,
Comment_A, -- -- to end of line
EndOfLineComment_J, -- // to end of line
TraditionalComment_J, -- /* on one line */
CommentHead_J, -- /* to end of line
CommentBody_J, -- anything in between
CommentTail_J, -- to */
Documentation_J, -- /** on one line */
DocumentationHead_J, -- /** to end of line
DocumentationBody_J, -- anything in between
DocumentationTail_J, -- to */
-- Ada and Java bad token (syntax error)
Bad_Token_AJ);
-- Classification of Token_Name by contiguous ranges. Every subtype below
-- depends on the declaration order of the enumeration literals.
-- Ada reserved words and Java keywords.
subtype Reserved_Word is Token_Name range Abort_A .. Xor_A;
-- Ada delimiters and compound delimiters, Java separators and operators.
subtype Delimiter is Token_Name range Colon_AJ .. UnsignedRightShiftAssign_J;
-- Java null/false/true, all numbers, character and string literals.
subtype Literal is Token_Name range Null_J .. String_AJ;
-- All numeric literals, whole and real, based or not.
subtype Number is Literal range Integer_AJ .. DoubleNumber_J;
-- Numeric literals without a fractional part (plain, based, Java long).
subtype Whole_Number is Number range Integer_AJ .. BasedLongInteger_J;
-- Real literals (plain, Ada based, Java float and double).
subtype Real_Number is Number range Real_AJ .. DoubleNumber_J;
-- All comment tokens of both languages, documentation comments included.
subtype Comment is Token_Name range Comment_A .. DocumentationTail_J;
-- Java documentation comments only (the /** ... */ forms).
subtype Documentation is Comment range Documentation_J .. DocumentationTail_J;
-- A lexical item as delivered by Get_Token. The representation is hidden;
-- use the query functions of this package to inspect a token.
type Token is private;
-- Kind of the token.
function Name (Item: Token) return Token_Name;
-- Textual image of the token.
-- NOTE(review): the private part stores an image only for numbers,
-- character/string literals, identifiers, comments and bad tokens; what
-- is returned for reserved words and delimiters is decided in the body -
-- confirm there.
function Image (Item: Token) return String;
-- Only Name is inlined here; Image is not.
pragma Inline (Name);
-- Operators cannot be made a subtype of Token_Name. Thus a function
-- is specified.
-- (The operator literals are not contiguous in Token_Name: e.g. And_A,
-- Mod_A and Xor_A are declared among the reserved words, Plus_AJ and
-- Equal_A among the delimiters.)
-- Ada: ARM 4.5(2) and or xor
-- (3) = /= < <= > >=
-- (4,5) + - &
-- (6) * / mod rem
-- (7) ** abs not
-- (not in operator symbol form)
-- Java: JRM 3.12 (see above)
-- Returns True if Item is one of the operators listed above.
-- An Ada operator symbol such as "=" is lexed as a string literal (see the
-- package header), which is what "not in operator symbol form" refers to.
function Is_Operator (Item: Token) return Boolean;
-- Only for based numbers.
-- The private Token_Core stores a base only for Based_Integer_AJ,
-- BasedLongInteger_J and Based_Real_A tokens.
-- NOTE(review): a call with any other kind of token presumably fails
-- (discriminant check on the variant record) - confirm against the body.
-- Number_Base is taken over from Ada.Text_IO.
subtype Number_Base is Ada.Text_IO.Number_Base;
function Base (Item: Token) return Number_Base;
pragma Inline (Base);
-- Only for Java documentation comments.
-- Tags that may be attached to a documentation comment token; No_Tag is
-- the value for a token without a tag.
type Documentation_Tag is
(No_Tag, See_Tag, Author_Tag, Version_Tag, Param_Tag, Return_Tag, Exception_Tag);
-- Every tag except No_Tag.
subtype Proper_Tag is Documentation_Tag range See_Tag .. Documentation_Tag'Last;
-- Tag and Tag_Pos query the tag information of a token; the private
-- Token_Core holds it only for tokens of subtype Documentation.
function Tag (Item: Token) return Documentation_Tag;
function Tag_Pos (Item: Token) return Natural; -- position within image
pragma Inline (Tag, Tag_Pos);
-- Textual form of a tag; No_Tag is excluded by the parameter subtype.
-- NOTE(review): the exact spelling returned (e.g. with or without a
-- leading '@') is defined in the body - confirm there.
function Image (Tag: Proper_Tag) return String;
-- Only for bad tokens (if there are errors, it depends on the context
-- when the bad token's end is assumed and which error is reported).
-- Illegal_Literal is used for any error in identifiers or numbers that
-- is not covered by more explicit reports.
-- Reason why a token was classified as Bad_Token_AJ.
type Token_Error is (Non_Language_Character, -- e.g. { for Ada, ` for Java
Illegal_Literal, -- e.g. non-graphic character
Illegal_Underline, -- for identifiers and numbers
Illegal_Base, -- for numbers
Missing_Base_Quote,
Illegal_Extended_Digit,
Missing_String_Quote); -- string terminated by EOL
-- Error reason of a bad token; the private Token_Core stores it only for
-- tokens whose Name is Bad_Token_AJ.
function Error (Item: Token) return Token_Error;
pragma Inline (Error);
-- Location of a token: First and Last refer to the input string,
-- Line and Col to the file. Each query is inlined individually.
function First (Item: Token) return Positive;
pragma Inline (First);
function Last (Item: Token) return Positive;
pragma Inline (Last);
function Line (Item: Token) return Positive;
pragma Inline (Line);
function Col (Item: Token) return Positive;
pragma Inline (Col);
-- Input string ---------------------------------------------------------
-- General access type used to hand the text to be lexed to the lexer.
type String_Pointer is access all String;
-- Deallocation for String_Pointer. Because String_Pointer is a general
-- access type, Free must only be applied to strings created by an
-- allocator, never to a pointer obtained from 'Access of a declared object.
procedure Free is new Ada.Unchecked_Deallocation (String, String_Pointer);
-- Sets up the lexer for Input and recognizes whether it is Ada or Java
-- (see package header); raises Unidentifyable_Language if the language
-- cannot be identified.
-- NOTE(review): Size_of_Tab is presumably the tab width used when counting
-- columns, and ownership of Input after the call is not visible from the
-- specification - confirm both against the body.
procedure Initialize (Input: in String_Pointer; Size_of_Tab: in Positive);
Unidentifyable_Language: exception;
-- Queries on the lexer's current position within Input (as opposed to
-- the token-based Line and Col functions, which take a Token).
-- NOTE(review): End_of_Input presumably becomes True once Get_Token
-- would raise End_Error - confirm against the body.
function Line return Positive;
function Col return Positive;
function End_of_Input return Boolean;
pragma Inline (Line, Col, End_of_Input);
-- Token parser ---------------------------------------------------------
-- Get_Token delivers the next token of the input string; End_Error is
-- raised when the end of input has been reached.
function Get_Token return Token;
-- Reset moves the lexer back to the position of to_Token, so that this
-- token and the ones following it are delivered by the next calls to
-- Get_Token. Reset_Error is raised if to_Token is not from the input
-- string.
procedure Reset (to_Token: in Token);
End_Error : exception;
Reset_Error: exception;
private
-- Plain variant record holding the data of a token; Token wraps it in a
-- controlled type. The discriminant has a default value so that
-- Token_Core can be used as the unconstrained component Core of Token.
type Token_Core (Name: Token_Name := Comment_A) is record
-- location of token in input string and file
Line , Col,
First, Last: Positive;
case Name is
-- Integer_AJ .. Bad_Token_AJ: numbers, character and string literals,
-- identifiers, comments and bad tokens carry an image. Reserved words,
-- delimiters and Null_J/False_J/True_J precede this range and carry none.
when Number'First .. Bad_Token_AJ =>
Image: String_Pointer; -- illegal token if null
-- Extra data for some of the kinds that carry an image.
case Name is
when Based_Real_A |
Based_Integer_AJ |
BasedLongInteger_J => Base : Number_Base;
when Documentation => Tag : Documentation_Tag;
Pos : Natural;
when Bad_Token_AJ => Error: Token_Error;
when others => null;
end case;
when others =>
null;
end case;
end record;
-- Full view of Token: a controlled wrapper around Token_Core.
-- Token was made controlled to prevent a storage leak (history, version 2.0).
type Token is new Ada.Finalization.Controlled with record
Core: Token_Core;
end record;
-- Overridings of the Ada.Finalization.Controlled primitives.
-- NOTE(review): presumably Adjust duplicates and Finalize frees the string
-- designated by Core.Image - confirm against the package body.
procedure Adjust (Object: in out Token);
procedure Finalize (Object: in out Token);
end Lexer;