tlang/source/tlang/compiler/lexer/kinds/basic.d

/**
 * A single-pass tokenizer
 */
module tlang.compiler.lexer.kinds.basic;

import std.container.slist;
import gogga;
import std.conv : to;
import std.ascii : isDigit;
import tlang.compiler.lexer.core;

/**
 * Represents a basic lexer which performs the whole tokenization
 * process in one short via a call to `performLex()`, only after
 * this may the `LexerInterface` methods, such as `getCurrentToken()`,
 * `nextToken()` and so forth, actually be used.
 *
 * This is effectively a single pass lexer.
 */
public final class BasicLexer : LexerInterface
{
    /**
     * Post-`performLex()` data
     *
     * Index into `tokens` of the token the cursor currently
     * sits on. This backs the cursor part of the
     * `LexerInterface` API.
     *
     * To-do, ensure these can only be used AFTER `performLex()`
     * has been called.
     */
    private ulong tokenPtr = 0;

    /**
     * Returns the token at the current cursor
     * position
     *
     * The cursor is not validated here, so callers
     * should consult `hasTokens()` first.
     *
     * Returns: the `Token`
     */
    public override Token getCurrentToken()
    {
        /* TODO: Throw an exception here when we try get more than we can */
        return tokens[tokenPtr];
    }

    /**
     * Moves the cursor one token forward
     */
    public override void nextToken()
    {
        tokenPtr++;
    }

    /**
     * Moves the cursor one token backwards
     */
    public override void previousToken()
    {
        tokenPtr--;
    }

    /**
     * Sets the position of the cursor
     *
     * Params:
     *   newPosition = the new position
     */
    public override void setCursor(ulong newPosition)
    {
        tokenPtr = newPosition;
    }

    /**
     * Retrieves the cursor's current position
     *
     * Returns: the position
     */
    public override ulong getCursor()
    {
        return tokenPtr;
    }

    /**
     * Checks whether more tokens are available
     * or not
     *
     * Returns: true if more tokens are available, false otherwise
     */
    public override bool hasTokens()
    {
        return tokenPtr < tokens.length;
    }

    /**
     * Get the line position of the lexer in the source text
     *
     * Returns: the position
     */
    public override ulong getLine()
    {
        return this.line;
    }

    /**
     * Get the column position of the lexer in the source text
     *
     * Returns: the position
     */
    public override ulong getColumn()
    {
        return this.column;
    }

    /**
     * Exhaustively provide a list of all tokens
     *
     * Returns: a `Token[]` containing all tokens
     */
    public override Token[] getTokens()
    {
        return tokens;
    }


    /**
    * Lexer state data
    */
    private string sourceCode; /* The source to be lexed */
    private ulong line = 1; /* Current line */
    private ulong column = 1; /* Current column */
    private Token[] currentTokens; /* Current token set */
    private string currentToken; /* Current token (the build-up) */
    private ulong position; /* Current character position */
    private char currentChar; /* Current character */
    private bool stringMode; /* Whether we are in a string "we are here" or not */
    private bool floatMode; /* Whether or not we are building a floating point constant */


    /* The tokens (only populated once `performLex()` has completed) */
    private Token[] tokens;

    /**
     * Constructs a new lexer over the given source text
     *
     * Params:
     *   sourceCode = the text to tokenize
     */
    this(string sourceCode)
    {
        this.sourceCode = sourceCode;
    }

    /**
     * Checks whether there is a character after the
     * one at the current position
     *
     * Returns: true if `position+1` is a valid index, false otherwise
     */
    private bool isForward()
    {
        return position+1 < sourceCode.length;
    }

    /**
     * Checks whether there is a character before the
     * one at the current position
     *
     * Returns: true if `position-1` is a valid index, false otherwise
     */
    public bool isBackward()
    {
        /* `position` is unsigned, so rule out 0 explicitly rather than lean on wraparound */
        return position > 0 && position-1 < sourceCode.length;
    }


    /**
    * Used for tokenising a2.b2
    *
    * When the `.` is encountered
    * and there are some characters
    * behind it this checks if we can
    * append a further dot to it
    */
    private bool isBuildUpValidIdent()
    {
        import tlang.compiler.symbols.check;
        return isPathIdentifier(currentToken) || isIdentifier(currentToken);
    }

    /**
    * Returns true if we have a token being built
    * false otherwise
    */
    private bool hasToken()
    {
        return currentToken.length != 0;
    }

    /**
     * Performs the lexing process
     *
     * Walks `sourceCode` one character at a time, building
     * up `currentToken` and appending finished tokens to
     * `currentTokens`. Once the input is exhausted the result
     * is published to `tokens`.
     *
     * Each token is created with the `line` and `column`
     * values held at the moment it is flushed.
     *
     * Throws:
     *   LexerException when a floating point literal is followed
     *   by an unexpected character, a `.` after an identifier is
     *   not followed by a letter, an escape sequence is unfinished
     *   or appears outside of a string, a character literal is
     *   malformed or a numerical encoder sequence is invalid
     */
    /* TODO: Use return value */
    public void performLex()
    {

        while(position < sourceCode.length)
        {
            // gprintln("SrcCodeLen: "~to!(string)(sourceCode.length));
            // gprintln("Position: "~to!(string)(position));

            currentChar = sourceCode[position];

            if(floatMode == true)
            {
                if(isDigit(currentChar))
                {
                    /* tack on and move to next iteration */
                    currentToken~=currentChar;
                    position++;
                    column++;
                    continue;
                }
                else
                {
                    /**
                    * A splitter or a space ends the literal. The space
                    * must be checked explicitly as `isSpliter()` does not
                    * cover it, otherwise `1.5 ` would be rejected
                    */
                    if(isSpliter(currentChar) || currentChar == ' ')
                    {
                        floatMode = false;
                        currentTokens ~= new Token(currentToken, line, column);
                        currentToken = "";

                        /* We just flush and catch the terminator in the next round, hence below is commented out */
                        // column++;
                        // position++;
                    }
                    else
                    {
                        throw new LexerException(this, "Floating point '"~currentToken~"' cannot be followed by a '"~currentChar~"'");
                    }
                }
            }
            else if(currentChar == ' ' && !stringMode)
            {
                /* A space outside of a string ends whatever was being built */
                if(currentToken.length != 0)
                {
                    currentTokens ~= new Token(currentToken, line, column);
                    currentToken = "";
                }

                column++;
                position++;
            }
            else if(isSpliter(currentChar) && !stringMode)
            {
                /* The splitter token to finally insert */
                string splitterToken;

                gprintln("Build up: "~currentToken);
                gprintln("Current char: "~currentChar);

                /* Check for case of `==` (where we are on the first `=` sign) */
                if(currentChar == '=' && isForward() && sourceCode[position+1] == '=')
                {
                    /* Flush any current token (if exists) */
                    if(currentToken.length)
                    {
                        currentTokens ~= new Token(currentToken, line, column);
                        currentToken = "";
                    }

                    // Create the `==` token
                    currentTokens ~= new Token("==", line, column);

                    // Skip over the current `=` and the next `=`
                    position+=2;

                    column+=2;

                    continue;
                }

                /**
                * If the build up is all digits and we are on a dot
                * then this is the start of a floating point literal
                *
                * TODO: Error checking will need to be added
                */
                if(isNumericalStr(currentToken) && currentChar == '.')
                {
                    /* Tack on the dot */
                    currentToken~=".";

                    /* Enable floating point mode and go to next iteration*/
                    floatMode = true;
                    gprintln("Float mode just got enabled: Current build up: \""~currentToken~"\"");
                    column++;
                    position++;
                    continue;
                }


                /**
                * Here we check if we have a `.` and that the characters
                * preceding us were all good for an identifier
                */
                import misc.utils;

                if(currentChar == '.' && hasToken() && isBuildUpValidIdent())
                {
                    gprintln("Bruh");
                    /**
                    * Now we check that we have a character in front of us
                    * and that it is a letter
                    *
                    * TODO: Add _ check too as that is a valid identifier start
                    */
                    if(isForward() && isCharacterAlpha(sourceCode[position+1]))
                    {
                        position++;
                        column+=1;

                        currentToken ~= '.';

                        continue;
                    }
                    else
                    {
                        throw new LexerException(this, "Expected a letter to follow the .");
                    }

                }
                /* Check if we need to do combinators (e.g. for ||, &&) */
                else if(currentChar == '|' && isForward() && sourceCode[position+1] == '|')
                {
                    splitterToken = "||";
                    column += 2;
                    position += 2;
                }
                else if(currentChar == '&' && isForward() && sourceCode[position+1] == '&')
                {
                    splitterToken = "&&";
                    column += 2;
                    position += 2;
                }
                else if (currentChar == '\n') /* TODO: Unrelated!!!!!, but we shouldn't allow this behaviour in string mode */
                {
                    /**
                    * Flush BEFORE moving to the next line so that the
                    * pending token is stamped with the line it was
                    * written on and not the one following it
                    */
                    if(currentToken.length)
                    {
                        currentTokens ~= new Token(currentToken, line, column);
                        currentToken = "";
                    }

                    line++;
                    column = 1;

                    position++;
                }
                else
                {
                    splitterToken = ""~currentChar;
                    column++;
                    position++;
                }


                /* Flush the current token (if one exists) */
                if(currentToken.length)
                {
                    currentTokens ~= new Token(currentToken, line, column);
                    currentToken = "";
                }

                /* Add the splitter token (only if it isn't empty, a newline produces none) */
                if(splitterToken.length)
                {
                    currentTokens ~= new Token(splitterToken, line, column);
                }
            }
            else if(currentChar == '"')
            {
                /* If we are not in string mode */
                if(!stringMode)
                {
                    /* Add the opening " to the token */
                    currentToken ~= '"';

                    /* Enable string mode */
                    stringMode = true;
                }
                /* If we are in string mode */
                else
                {
                    /* Add the closing " to the token */
                    currentToken ~= '"';

                    /* Flush the token */
                    currentTokens ~= new Token(currentToken, line, column);
                    currentToken = "";

                    /* Get out of string mode */
                    stringMode = false;
                }

                column++;
                position++;
            }
            else if(currentChar == '\\')
            {
                /* You can only use these in strings */
                if(stringMode)
                {
                    /* Check if we have a next character */
                    if(position+1 != sourceCode.length && isValidEscape_String(sourceCode[position+1]))
                    {
                        /* Add the backslash and the escaped character to the string */
                        currentToken ~= "\\"~sourceCode[position+1];

                        column += 2;
                        position += 2;
                    }
                    /* If we don't have a next character then raise error */
                    else
                    {
                        throw new LexerException(this, "Unfinished escape sequence");
                    }
                }
                else
                {
                    throw new LexerException(this, "Escape sequences can only be used within strings");
                }
            }
            /* Character literal support */
            else if(!stringMode && currentChar == '\'')
            {
                currentToken ~= "'";

                /* Character literal must be next */
                if(position+1 != sourceCode.length)
                {
                    /* TODO: Escape support for \' */

                    /* Get the character */
                    currentToken ~= ""~sourceCode[position+1];
                    column++;
                    position++;


                    /* Closing ' must be next */
                    if(position+1 != sourceCode.length && sourceCode[position+1] == '\'')
                    {
                        /* Generate and add the token */
                        currentToken ~= "'";
                        currentTokens ~= new Token(currentToken, line, column);

                        /* Flush the token */
                        currentToken = "";

                        /* Step over the literal's character and the closing ' */
                        column += 2;
                        position += 2;
                    }
                    else
                    {
                        throw new LexerException(this, "Was expecting closing ' when finishing character literal");
                    }
                }
                else
                {
                    throw new LexerException(this, LexerError.EXHAUSTED_CHARACTERS, "EOSC reached when trying to get character literal");
                }
            }
            /**
            * If we are building up a number
            *
            * TODO: Build up token right at the end (#DuplicateCode)
            */
            else if(isBuildUpNumerical())
            {
                gprintln("jfdjkhfdjkhfsdkj");
                /* fetch the encoder segment */
                char[] encoderSegment = numbericalEncoderSegmentFetch();

                gprintln("isBuildUpNumerical(): Enter");

                /**
                * If we don't have any encoders
                *
                * NOTE(review): `isBuildUpNumerical()` only returns true
                * when the build-up holds at least one non-digit, so the
                * segment fetched above looks to never be empty here and
                * this branch appears unreachable as written - confirm
                */
                if(encoderSegment.length == 0)
                {
                    /* We can add a signage encoder */
                    if(isNumericalEncoder_Signage(currentChar))
                    {
                        gprintln("Hello");

                        /* Check if the next character is a size (it MUST be) */
                        if(isForward() && isNumericalEncoder_Size(sourceCode[position+1]))
                        {
                            currentToken ~= currentChar;
                            column++;
                            position++;


                        }
                        else
                        {
                            throw new LexerException(this, "You MUST specify a size encoder after a signagae encoder");
                        }


                    }
                    /* We can add a size encoder */
                    else if(isNumericalEncoder_Size(currentChar))
                    {
                        currentToken ~= currentChar;
                        column++;
                        position++;
                    }
                    /* We can add more numbers */
                    else if(isDigit(currentChar))
                    {
                        currentToken ~= currentChar;
                        column++;
                        position++;
                    }
                    /* Splitter (TODO) */
                    else if(isSpliter(currentChar))
                    {
                        /* Add the numerical literal as a new token */
                        currentTokens ~= new Token(currentToken, line, column);

                        /* Add the splitter token if not a newline */
                        if(currentChar != '\n')
                        {
                            currentTokens ~= new Token(""~currentChar, line, column);
                        }


                        /* Flush the token */
                        currentToken = "";

                        /* TODO: Check these */
                        column += 2;
                        position += 2;
                    }
                    /* Anything else is invalid */
                    else
                    {
                        throw new LexerException(this, "Not valid TODO");
                    }
                }
                /**
                * If we have one encoder
                */
                else if((encoderSegment.length == 1))
                {
                    /* Check what the encoder is */

                    /**
                    * If we had a signage then we must have a size after it
                    */
                    if(isNumericalEncoder_Signage(encoderSegment[0]))
                    {
                        /**
                        * Size encoder must then follow
                        */
                        if(isNumericalEncoder_Size(currentChar))
                        {
                            currentToken ~= currentChar;
                            column++;
                            position++;

                            /* Add the numerical literal as a new token */
                            currentTokens ~= new Token(currentToken, line, column);

                            /* Flush the token */
                            currentToken = "";

                        }
                        /**
                        * Anything else is invalid
                        */
                        else
                        {
                            throw new LexerException(this, "A size-encoder must follow a signage encoder");
                        }
                    }
                    else
                    {
                        throw new LexerException(this, "Cannot have another encoder after a size encoder");
                    }
                }
                /* It is impossible to reach this as flushing means we cannot add more */
                else
                {
                    assert(false);
                }


            }
            /* Any other case, keep building the current token */
            else
            {
                currentToken ~= currentChar;
                column++;
                position++;
            }
        }

        /* If there was a token made at the end then flush it */
        if(currentToken.length)
        {
            currentTokens ~= new Token(currentToken, line, column);

            /* Clear the build-up so that calling us again does not append it a second time */
            currentToken = "";
        }

        tokens = currentTokens;
    }

    /**
    * Finds the index of the first non-digit character
    * in the current build up
    *
    * Returns: the index, or 0 when no non-digit was found
    * (which covers the empty and the all-digits build up
    * as well as one starting with a non-digit)
    */
    private ulong firstNonDigitIndex()
    {
        for(ulong i = 0; i < currentToken.length; i++)
        {
            if(!isDigit(currentToken[i]))
            {
                return i;
            }
        }

        return 0;
    }

    /**
    * Returns the part of the current build up that
    * starts at its first non-digit character
    *
    * If no non-digit character exists then the whole
    * build up is returned
    */
    private char[] numbericalEncoderSegmentFetch()
    {
        ulong stopped = firstNonDigitIndex();

        return cast(char[])currentToken[stopped..currentToken.length];
    }

    /**
    * Returns true if the current build up is one or
    * more digits followed by at least one character,
    * where everything after the digits is a numerical
    * encoder
    *
    * FIXME: A build up made ONLY of digits is reported as
    * non-numerical as "no non-digit found" cannot be told
    * apart from "non-digit found at index 0"
    */
    private bool isBuildUpNumerical()
    {
        ulong stopped = firstNonDigitIndex();

        /**
        * We need SOME numerical stuff
        */
        if(stopped == 0)
        {
            return false;
        }

        char[] remaining = cast(char[])currentToken[stopped..currentToken.length];

        /* Everything following the digits must be an encoder */
        for(ulong i = 0; i < remaining.length; i++)
        {
            if(!isNumericalEncoder(remaining[i]))
            {
                return false;
            }
        }

        return true;
    }

    /**
    * Given a string return true if all characters
    * are digits, false otherwise and false if
    * the string is empty
    */
    private static bool isNumericalStr(string input)
    {
        /**
        * If the given input is empty then return false
        */
        if(input.length == 0)
        {
            return false;
        }

        /**
         * If there are any characters in the string then
         * check if all are digits
         */
        for(ulong i = 0; i < input.length; i++)
        {
            char character = input[i];

            if(!isDigit(character))
            {
                return false;
            }
        }

        return true;
    }

    /**
    * Returns true if the given character is one that
    * splits tokens, false otherwise
    *
    * Note that the space character is not part of this
    * set
    */
    private bool isSpliter(char character)
    {
        return character == ';' || character == ',' || character == '(' ||
                character == ')' || character == '[' || character == ']' ||
                character == '+' || character == '-' || character == '/' ||
                character == '%' || character == '*' || character == '&' ||
                character == '{' || character == '}' || character == '=' ||
                character == '|' || character == '^' || character == '!' ||
                character == '\n' || character == '~' || character =='.' ||
                character == ':'; //|| isNumericalEncoder(character);
    }

    /**
    * Returns true if the given character is either a
    * size encoder or a signage encoder
    */
    private bool isNumericalEncoder(char character)
    {
        return isNumericalEncoder_Size(character) ||
               isNumericalEncoder_Signage(character);
    }

    /**
    * Returns true if the given character is a size
    * encoder, namely one of `B`, `W`, `I` or `L`
    */
    private bool isNumericalEncoder_Size(char character)
    {
        return character == 'B' || character == 'W' ||
               character == 'I' || character == 'L';
    }

    /**
    * Returns true if the given character is a signage
    * encoder, namely `S` or `U`
    */
    private bool isNumericalEncoder_Signage(char character)
    {
        return character == 'S' || character == 'U';
    }

    /**
    * Checks whether the given character may follow a
    * backslash within a string
    *
    * Currently every character is accepted
    */
    public bool isValidEscape_String(char character)
    {
        return true; /* TODO: Implement me */
    }
}

/* Test input: `hello "world";` */
unittest
{
    import std.algorithm.comparison;

    /* An identifier, a string literal and a closing semicolon */
    BasicLexer lexer = new BasicLexer("hello \"world\";");
    lexer.performLex();

    Token[] produced = lexer.getTokens();
    gprintln("Collected "~to!(string)(produced));

    Token[] expected = [
        new Token("hello", 0, 0),
        new Token("\"world\"", 0, 0),
        new Token(";", 0, 0)
    ];
    assert(produced == expected);
}

/* Test input: `hello "world"|| ` */
unittest
{
    import std.algorithm.comparison;

    /* The `||` combinator followed by a trailing space */
    string input = "hello \"world\"|| ";
    BasicLexer lexer = new BasicLexer(input);
    lexer.performLex();

    Token[] produced = lexer.getTokens();
    gprintln("Collected "~to!(string)(produced));

    Token[] expected = [
        new Token("hello", 0, 0),
        new Token("\"world\"", 0, 0),
        new Token("||", 0, 0)
    ];
    assert(produced == expected);
}

/* Test input: `hello "world"||` */
unittest
{
    import std.algorithm.comparison;

    /* The `||` combinator as the very last thing in the input */
    string input = "hello \"world\"||";
    BasicLexer lexer = new BasicLexer(input);
    lexer.performLex();

    Token[] produced = lexer.getTokens();
    gprintln("Collected "~to!(string)(produced));

    Token[] expected = [
        new Token("hello", 0, 0),
        new Token("\"world\"", 0, 0),
        new Token("||", 0, 0)
    ];
    assert(produced == expected);
}

/* Test input: `hello "world";|` */
unittest
{
    import std.algorithm.comparison;

    /* A lone `|` at the end must come out as its own token */
    string input = "hello \"world\";|";
    BasicLexer lexer = new BasicLexer(input);
    lexer.performLex();

    Token[] produced = lexer.getTokens();
    gprintln("Collected "~to!(string)(produced));

    Token[] expected = [
        new Token("hello", 0, 0),
        new Token("\"world\"", 0, 0),
        new Token(";", 0, 0),
        new Token("|", 0, 0)
    ];
    assert(produced == expected);
}

/* Test input: ` hello` (note the leading space) */
unittest
{
    import std.algorithm.comparison;

    /* A leading space must not produce a token of its own */
    BasicLexer lexer = new BasicLexer(" hello");
    lexer.performLex();

    Token[] produced = lexer.getTokens();
    gprintln("Collected "~to!(string)(produced));

    Token[] expected = [new Token("hello", 0, 0)];
    assert(produced == expected);
}

/* Test input: ` hello;` (note the leading space) */
unittest
{
    import std.algorithm.comparison;

    /* Leading space, then an identifier directly followed by a semicolon */
    BasicLexer lexer = new BasicLexer(" hello;");
    lexer.performLex();

    Token[] produced = lexer.getTokens();
    gprintln("Collected "~to!(string)(produced));

    Token[] expected = [
        new Token("hello", 0, 0),
        new Token(";", 0, 0)
    ];
    assert(produced == expected);
}

/* Test input: `hello "world\""` */
unittest
{
    import std.algorithm.comparison;

    /* An escaped quote inside a string must stay part of that string */
    string input = "hello \"world\\\"\"";
    BasicLexer lexer = new BasicLexer(input);
    lexer.performLex();

    Token[] produced = lexer.getTokens();
    gprintln("Collected "~to!(string)(produced));

    Token[] expected = [
        new Token("hello", 0, 0),
        new Token("\"world\\\"\"", 0, 0)
    ];
    assert(produced == expected);
}

/* Test input: `'c'` */
unittest
{
    import std.algorithm.comparison;

    /* A character literal comes out as one token, quotes included */
    BasicLexer lexer = new BasicLexer("'c'");
    lexer.performLex();

    Token[] produced = lexer.getTokens();
    gprintln("Collected "~to!(string)(produced));

    Token[] expected = [new Token("'c'", 0, 0)];
    assert(produced == expected);
}

/* Test input: `2121\n2121` */
unittest
{
    import std.algorithm.comparison;

    /* A newline separates the two numbers without producing a token itself */
    BasicLexer lexer = new BasicLexer("2121\n2121");
    lexer.performLex();

    Token[] produced = lexer.getTokens();
    gprintln("Collected "~to!(string)(produced));

    Token[] expected = [
        new Token("2121", 0, 0),
        new Token("2121", 0, 0)
    ];
    assert(produced == expected);
}

/**
* Test `=` and `==` handling
*/
unittest
{
    import std.algorithm.comparison;

    /* Pairs a source text with the token texts it is expected to lex into */
    static struct Case
    {
        string input;
        string[] expected;
    }

    Case[] cases = [
        Case(" =\n", ["="]),
        Case(" = ==\n", ["=", "=="]),
        Case(" ==\n", ["=="]),
        Case(" = =\n", ["=", "="]),
        Case(" ==, = ==\n", ["==", ",", "=", "=="]),

        // Test flushing of previous token
        Case("i==i=\n", ["i", "==", "i", "="])
    ];

    foreach(Case testCase; cases)
    {
        BasicLexer lexer = new BasicLexer(testCase.input);
        lexer.performLex();
        gprintln("Collected "~to!(string)(lexer.getTokens()));

        /* Build the expected tokens from their texts */
        Token[] wanted;
        foreach(string text; testCase.expected)
        {
            wanted ~= new Token(text, 0, 0);
        }

        assert(lexer.getTokens() == wanted);
    }
}

/**
* Test: Literal value encoding
*
* Tests validity
*/
unittest
{
    import std.algorithm.comparison;

    /**
    * Each of these is a valid encoded literal and must
    * come out as a single token matching the input:
    *
    * 21L  (size encoder only)
    * 21UL (signage encoder followed by size encoder)
    */
    foreach(string literal; ["21L", "21UL"])
    {
        BasicLexer lexer = new BasicLexer(literal);
        lexer.performLex();
        gprintln("Collected "~to!(string)(lexer.getTokens()));
        assert(lexer.getTokens() == [new Token(literal, 0, 0)]);
    }

    /* Disabled: the invalid case below relies on `performLex()` returning a status */
    // /* 21U (invalid) */
    // sourceCode = "21U ";
    // currentLexer = new Lexer(sourceCode);
    // // gprintln(currentLexer.performLex());
    // bool status = currentLexer.performLex();
    // gprintln("Collected "~to!(string)(currentLexer.getTokens()));
    // assert(!status);
}

/* Test input: `1.5` */
unittest
{
    import std.algorithm.comparison;

    /* Digits, a dot and more digits form one floating point token */
    BasicLexer lexer = new BasicLexer("1.5");
    lexer.performLex();

    Token[] produced = lexer.getTokens();
    gprintln("Collected "~to!(string)(produced));

    Token[] expected = [new Token("1.5", 0, 0)];
    assert(produced == expected);
}

/**
* Test correct handling of dot-operator for
* non-floating point cases
*
* Input: `new A().l.p.p;`
*/
unittest
{
    import std.algorithm.comparison;

    BasicLexer lexer = new BasicLexer("new A().l.p.p;");
    lexer.performLex();

    Token[] produced = lexer.getTokens();
    gprintln("Collected "~to!(string)(produced));

    /**
    * The dot right after `)` stands alone, whereas the dots
    * joining `l`, `p` and `p` are folded into one token
    */
    string[] expectedTexts = ["new", "A", "(", ")", ".", "l.p.p", ";"];

    Token[] expected;
    foreach(string text; expectedTexts)
    {
        expected ~= new Token(text, 0, 0);
    }

    assert(produced == expected);
}