You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
236 lines
5.4 KiB
D
236 lines
5.4 KiB
D
/**
|
|
* Lexer interface definition
|
|
*/
|
|
module tlang.compiler.lexer.core.lexer;
|
|
|
|
import tlang.compiler.lexer.core.tokens : Token;
|
|
import std.ascii : isDigit, isAlpha, isWhite;
|
|
|
|
/**
|
|
* Defines the interface a lexer must provide
|
|
* such that is can be used to source tokens
|
|
* from in the parser
|
|
*/
|
|
public interface LexerInterface
|
|
{
|
|
/**
|
|
* Returns the token at the current cursor
|
|
* position
|
|
*
|
|
* Returns: the `Token`
|
|
*/
|
|
public Token getCurrentToken();
|
|
|
|
/**
|
|
* Moves the cursor one token forward
|
|
*/
|
|
public void nextToken();
|
|
|
|
/**
|
|
* Moves the cursor one token backwards
|
|
*/
|
|
public void previousToken();
|
|
|
|
/**
|
|
* Sets the position of the cursor
|
|
*
|
|
* Params:
|
|
* cursor = the new position
|
|
*/
|
|
public void setCursor(ulong cursor);
|
|
|
|
/**
|
|
* Retrieves the cursor's current position
|
|
*
|
|
* Returns: the position
|
|
*/
|
|
public ulong getCursor();
|
|
|
|
/**
|
|
* Checks whether more tokens are available
|
|
* of not
|
|
*
|
|
* Returns: true if more tokens are available, false otherwise
|
|
*/
|
|
public bool hasTokens();
|
|
|
|
/**
|
|
* Get the line position of the lexer in the source text
|
|
*
|
|
* Returns: the position
|
|
*/
|
|
public ulong getLine();
|
|
|
|
/**
|
|
* Get the column position of the lexer in the source text
|
|
*
|
|
* Returns: the position
|
|
*/
|
|
public ulong getColumn();
|
|
|
|
/**
|
|
* Exhaustively provide a list of all tokens
|
|
*
|
|
* Returns: a `Token[]` containing all tokens
|
|
*/
|
|
public Token[] getTokens();
|
|
}
|
|
|
|
/**
|
|
* Human-readable names assigned
|
|
* to commonly used character
|
|
* constants
|
|
*/
|
|
public enum LexerSymbols : char
|
|
{
|
|
L_PAREN = '(',
|
|
R_PAREN = ')',
|
|
SEMI_COLON = ';',
|
|
COMMA = ',',
|
|
L_BRACK = '[' ,
|
|
R_BRACK = ']' ,
|
|
PLUS = '+' ,
|
|
MINUS = '-' ,
|
|
FORWARD_SLASH = '/' ,
|
|
PERCENT = '%' ,
|
|
STAR = '*' ,
|
|
AMPERSAND = '&' ,
|
|
L_BRACE = '{' ,
|
|
R_BRACE = '}' ,
|
|
EQUALS = '=' ,
|
|
SHEFFER_STROKE = '|' ,
|
|
CARET = '^' ,
|
|
EXCLAMATION = '!' ,
|
|
TILDE = '~' ,
|
|
DOT = '.' ,
|
|
COLON = ':',
|
|
SPACE = ' ',
|
|
TAB = '\t',
|
|
NEWLINE = '\n',
|
|
DOUBLE_QUOTE = '"',
|
|
SINGLE_QUOTE = '\'' ,
|
|
BACKSLASH = '\\' ,
|
|
UNDERSCORE = '_' ,
|
|
LESS_THAN = '<' ,
|
|
BIGGER_THAN = '>' ,
|
|
|
|
ESC_NOTHING = '0' ,
|
|
ESC_CARRIAGE_RETURN = 'r' ,
|
|
ESC_TAB = 't' ,
|
|
ESC_NEWLINE = 'n' ,
|
|
ESC_BELL= 'a' ,
|
|
|
|
ENC_BYTE = 'B' ,
|
|
ENC_INT = 'I' ,
|
|
ENC_LONG = 'L' ,
|
|
ENC_WORD = 'W' ,
|
|
ENC_UNSIGNED = 'U' ,
|
|
ENC_SIGNED = 'S' ,
|
|
}
|
|
|
|
/**
|
|
* Alias to `LexerSymbols`
|
|
*/
|
|
public alias LS = LexerSymbols;
|
|
|
|
/**
|
|
* Checks if the provided character is an operator
|
|
*
|
|
* Params:
|
|
* c = the character to check
|
|
* Returns: `true` if it is a character, `false`
|
|
* otherwise
|
|
*/
|
|
public bool isOperator(char c)
|
|
{
|
|
return c == LS.PLUS || c == LS.TILDE || c == LS.MINUS ||
|
|
c == LS.STAR || c == LS.FORWARD_SLASH || c == LS.AMPERSAND ||
|
|
c == LS.CARET || c == LS.EXCLAMATION || c == LS.SHEFFER_STROKE ||
|
|
c == LS.LESS_THAN || c == LS.BIGGER_THAN;
|
|
}
|
|
|
|
/**
|
|
* Checks if the provided character is a splitter
|
|
*
|
|
* Params:
|
|
* c = the character to check
|
|
* Returns: `true` if it is a splitter, `false`
|
|
* otherwise
|
|
*/
|
|
public bool isSplitter(char c)
|
|
{
|
|
return c == LS.SEMI_COLON || c == LS.COMMA || c == LS.L_PAREN ||
|
|
c == LS.R_PAREN || c == LS.L_BRACK || c == LS.R_BRACK ||
|
|
c == LS.PERCENT || c == LS.L_BRACE || c == LS.R_BRACE ||
|
|
c == LS.EQUALS || c == LS.DOT || c == LS.COLON ||
|
|
isOperator(c) || isWhite(c);
|
|
}
|
|
|
|
/**
|
|
* Checks if the provided character is a
|
|
* numerical size encoder
|
|
*
|
|
* Params:
|
|
* character = the character to check
|
|
* Returns: `true` if so, `false` otheriwse
|
|
*/
|
|
public bool isNumericalEncoder_Size(char character)
|
|
{
|
|
return character == LS.ENC_BYTE || character == LS.ENC_WORD ||
|
|
character == LS.ENC_INT || character == LS.ENC_LONG;
|
|
}
|
|
|
|
/**
|
|
* Checks if the provided character is a
|
|
* numerical signage encoder
|
|
*
|
|
* Params:
|
|
* character = the character to check
|
|
* Returns: `true` if so, `false` otherwise
|
|
*/
|
|
public bool isNumericalEncoder_Signage(char character)
|
|
{
|
|
return character == LS.ENC_SIGNED || character == LS.ENC_UNSIGNED;
|
|
}
|
|
|
|
/**
|
|
* Checks if the provided character is
|
|
* either a numerical size encoder
|
|
* or signage encoder
|
|
*
|
|
* Params:
|
|
* character = the character to check
|
|
* Returns: `true` if so, `false` otherwise
|
|
*/
|
|
public bool isNumericalEncoder(char character)
|
|
{
|
|
return isNumericalEncoder_Size(character) ||
|
|
isNumericalEncoder_Signage(character);
|
|
}
|
|
|
|
/**
|
|
* Checks if the given character is a valid
|
|
* escape character (something which would
|
|
* have followed a `\`)
|
|
*
|
|
* Params:
|
|
* character = the character to check
|
|
* Returns: `true` if so, `false` otherwise
|
|
*/
|
|
public bool isValidEscape_String(char character)
|
|
{
|
|
return character == LS.BACKSLASH || character == LS.DOUBLE_QUOTE || character == LS.SINGLE_QUOTE ||
|
|
character == LS.ESC_NOTHING || character == LS.ESC_NEWLINE || character == LS.ESC_CARRIAGE_RETURN ||
|
|
character == LS.TAB || character == LS.ESC_BELL;
|
|
}
|
|
|
|
/**
|
|
* Given a character return whether it is valid entry
|
|
* for preceding a '.'.
|
|
*
|
|
* Returns: `true` if so, otherwise `false`
|
|
*/
|
|
public bool isValidDotPrecede(char character)
|
|
{
|
|
return character == LS.R_PAREN || character == LS.R_BRACK; // || isAlpha(character) || isDigit(character);
|
|
} |