/*  CS241 Scanner

    Starter code for the CS 241 assembler (assignments 3 and 4).
    Code contained here may be included in submissions for CS241
    assignments at the University of Waterloo.

    Modified January 2011 by Simon Parent.
    Modified 30 Jan 2010 by JCBeatty.
    (LD_LIBRARY_PATH changed to LIBRARY_PATH in comment.)
    Modified Nov. 1, 2009 to correct a bug in scanning empty lines.
    Gordon V. Cormack, October 2009
    Adapted from a version by Brad Lushman, May 2009
    Original Java version due to Ondrej Lhotak, September 2007

    ---------------------------------------------------------------

    To compile on a CSCF linux machine, use:

            gcc -g asm.c -o asm

    To run:
            ./asm           < source.asm > program.mips
            valgrind ./asm  < source.asm > program.mips
 */

//======================================================================
//========= Declarations for the scan() function =======================
//======================================================================
//
// The DFA on which this scanner is based has a very nice
// property: there are NO transitions from an accepting
// state to a non-accepting state. As a result, Maximal
// Munch is the same as Simplified Maximal Munch (discussed
// in week 6) - there would be no point in keeping track
// of the most recently visited accepting state.

// Each token has one of the following kinds.

typedef enum {
    ID,                 // Opcode or identifier (use of a label)
    INT,                // Decimal integer
    HEXINT,             // Hexadecimal integer
    REGISTER,           // Register number
    COMMA,              // Comma
    LPAREN,             // (
    RPAREN,             // )
    LABEL,              // Declaration of a label (with a colon)
    DOTWORD,            // .word directive
    WHITESPACE,         // Whitespace
    NUL                 // Bad/invalid token
} Kind;

// Each token is described by its kind, its lexeme and,
// for kinds INT, HEXINT, REGISTER, its integer value.

typedef struct {
    Kind      kind;
    char      *lexeme;
    long long value;    // value is at least 64 bits wide.
} Token;

// scan() separates an input line into tokens,
//        returns an array of Token, and
//        sets num_tokens to the number of tokens scanned.

Token *scan( char *input, int *num_tokens );

// kindString(k) returns string a representation of kind k
// that is useful for error and debugging messages.

char *kindString( Kind k );

// read_line() reads a line of text from standard input.
// It returns NULL on end of file.

char *read_line( void );

// =====================================================================
// The implementation of scan() and associated type definitions.
// If you just want to use the scanner, skip to the next ==== separator.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// States for the finite-state automaton that comprises the scanner.

typedef enum {
    ST_NUL,
    ST_START,
    ST_DOLLAR,
    ST_MINUS,
    ST_REGISTER,
    ST_INT,
    ST_ID,
    ST_LABEL,
    ST_COMMA,
    ST_LPAREN,
    ST_RPAREN,
    ST_ZERO,
    ST_ZEROX,
    ST_HEXINT,
    ST_COMMENT,
    ST_DOT,
    ST_DOTW,
    ST_DOTWO,
    ST_DOTWOR,
    ST_DOTWORD,
    ST_WHITESPACE
} State;

// The *kind* of token (see previous enum declaration)
// represented by each state; states that don't represent
// a token have kind == NUL.  NUL is guaranteed to be
// represented by 0, WHITESPACE by 1, etc.  Hence the
// number of values is WHITESPACE+1.

Kind stateKinds[] = {
    NUL,            // ST_NUL
    WHITESPACE,     // ST_START
    NUL,            // ST_DOLLAR
    NUL,            // ST_MINUS
    REGISTER,       // ST_REGISTER
    INT,            // ST_INT
    ID,             // ST_ID
    LABEL,          // ST_LABEL
    COMMA,          // ST_COMMA
    LPAREN,         // ST_LPAREN
    RPAREN,         // ST_RPAREN
    INT,            // ST_ZERO
    NUL,            // ST_ZEROX
    HEXINT,         // ST_HEXINT
    WHITESPACE,     // ST_COMMENT
    NUL,            // ST_DOT
    NUL,            // ST_DOTW
    NUL,            // ST_DOTWO
    NUL,            // ST_DOTWOR
    DOTWORD,        // ST_DOTWORD
    WHITESPACE      // ST_WHITESPACE
};

State delta[ST_WHITESPACE+1][256];

#define whitespace "\t\n\r "
#define letters    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
#define digits     "0123456789"
#define hexDigits  "0123456789ABCDEFabcdef"
#define oneToNine  "123456789"

void setT( State from, const char *c, State to ) {
    int idx;
    for( idx = 0; idx < strlen(c); idx++ ) delta[from][c[idx]] = to;
}

void initT( void ){

    // fprintf( stderr, "SizeOfT(%) = %d bytes\n.", sizeof(T) );
    int i, j;

    // The default transition is ST_NUL (i.e., no transition
    // defined for this char).
    for ( i=0; i<=ST_WHITESPACE; i++ ) {
        for ( j=0; j<256; j++ ) {
            delta[i][j] = ST_NUL;
        }
    }
    // Non-null transitions of the finite state machine.
    // NB: in the third line below, letters digits are macros
    // that are replaced by string literals, which the compiler
    // will concatenate into a single string literal.
    setT( ST_START,      whitespace,     ST_WHITESPACE );
    setT( ST_WHITESPACE, whitespace,     ST_WHITESPACE );
    setT( ST_START,      letters,        ST_ID         );
    setT( ST_ID,         letters digits, ST_ID         );
    setT( ST_START,      oneToNine,      ST_INT        );
    setT( ST_INT,        digits,         ST_INT        );
    setT( ST_START,      "-",            ST_MINUS      );
    setT( ST_MINUS,      oneToNine,      ST_INT        );
    setT( ST_START,      ",",            ST_COMMA      );
    setT( ST_START,      "(",            ST_LPAREN     );
    setT( ST_START,      ")",            ST_RPAREN     );
    setT( ST_START,      "$",            ST_DOLLAR     );
    setT( ST_DOLLAR,     digits,         ST_REGISTER   );
    setT( ST_REGISTER,   digits,         ST_REGISTER   );
    setT( ST_START,      "0",            ST_ZERO       );
    setT( ST_ZERO,       "x",            ST_ZEROX      );
    setT( ST_ZERO,       oneToNine,      ST_INT        );
    setT( ST_ZEROX,      hexDigits,      ST_HEXINT     );
    setT( ST_HEXINT,     hexDigits,      ST_HEXINT     );
    setT( ST_ID,         ":",            ST_LABEL      );
    setT( ST_START,      ";",            ST_COMMENT    );
    setT( ST_START,      ".",            ST_DOT        );
    setT( ST_DOT,        "w",            ST_DOTW       );
    setT( ST_DOTW,       "o",            ST_DOTWO      );
    setT( ST_DOTWO,      "r",            ST_DOTWOR     );
    setT( ST_DOTWOR,     "d",            ST_DOTWORD    );

    // ST_COMMENT includes all characters, so it will consist of
    // the entire remainder of the line.  This suffices because
    // for the assembler, scan(...) is always passed one line
    // at a time. If we were scanning an entire source file,
    // we'd need to terminate the comment when we encountered
    // an end-of-line (\n, \r, or \r\n, depending on the
    // platform). As it happens, this routine won't ever see
    // the end-of-line because read_line(...) doesn't include
    // it in the strings it returns.
    for ( j=0; j<256; j++ ) delta[ST_COMMENT][j] = ST_COMMENT;
}

static int initT_done = 0;

Token *scan( char *input, int *len ){

    Token *tokens = (Token*)malloc(sizeof(Token));
    tokens[0].lexeme = (char*)malloc(strlen(input) + 1);
    int tok_array_size = 1;

    int inlen = strlen(input);
    State state = ST_START;  // The current state of the DFA.
    int i, tok_num, lexeme_len;

    // Initialize the transition table when called for the first time.
    if(!initT_done) {
        initT();
        initT_done = 1;
    }

    for ( i = tok_num = lexeme_len = 0; i <= inlen; i++, lexeme_len++ ) {
        // Process each character of input, including the NUL terminator
        State nextState = delta[state][input[i]];
        if ( nextState != ST_NUL ) {
            // Copy the next character of the lexeme.
            tokens[tok_num].lexeme[lexeme_len] = input[i];
            state = nextState;
        } else {
            // We've come to the end of a token. Process
            // it and get ready to look for the next token.
            // If the line starts with one or more blanks (ie
            // whitespace), then nextState will be ST_NULL
            // and the first one or more times through this
            // loop we'll execute this portion of the if-stmt,
            // the effect of which is to swallow the whitespace
            // character and see where the NEXT input character
            // takes the DFA from ST_START. If the line begins
            // with a non-whitespace character - presumably an
            // ID or a LABEL - then execution flows instead to
            // the else part of the if-stmt until we reach the
            // last character of the token, from which *this*
            // DFA will have no transition (t == ST_NUL),
            // which causes this portion of the if-stmt to be
            // executed so that we can package up the token and
            // get set to look for the next one. NB: this is
            // what we'll call a "simplified maximal munch"
            // scanner based on the DFA for MIPS tokens -
            // it is not the most general case, but we'll
            // postpone a discussion of more general lexical
            // scanners to a lecture in week 6.

            // Tack on the NUL character that marks the end
            // of the lexeme string.
            tokens[tok_num].lexeme[lexeme_len] = '\0';

            tokens[tok_num].kind = stateKinds[state];

            if ( tokens[tok_num].kind == INT ) {
                // Get the INT's binary representation.
                sscanf( tokens[tok_num].lexeme, "%lld",
                        &tokens[tok_num].value );
                // Make sure that the number is not too big.
                if ( strlen(tokens[tok_num].lexeme) > 11 )
                    tokens[tok_num].kind = NUL;
                if ( tokens[tok_num].value < -2147483647-1 ||
                        tokens[tok_num].value > 4294967295LL )
                    tokens[tok_num].kind = NUL;
                if ( tokens[tok_num].kind == NUL ) {
                    fprintf( stderr, "ERROR: INT %s out of range.\n",
                            tokens[tok_num].lexeme );
                    exit(1);
                }
            } else if (tokens[tok_num].kind == HEXINT) {
                // Get the HEXINT's binary representation.
                sscanf( tokens[tok_num].lexeme, "%llx",
                        &tokens[tok_num].value );
                // Make sure that the number is not too big.
                if ( strlen(tokens[tok_num].lexeme) > 10 )
                    tokens[tok_num].kind = NUL;
                if ( tokens[tok_num].kind == NUL ) {
                    fprintf( stderr, "ERROR: HEXINT %s out of range.\n",
                            tokens[tok_num].lexeme );
                    exit(1);
                }
            } else if ( tokens[tok_num].kind == REGISTER ) {
                // Get the INT's binary representation.
                sscanf( tokens[tok_num].lexeme+1, "%lld",
                        &tokens[tok_num].value );
                // Ensure that the register number is valid.
                if ( tokens[tok_num].value > 31 )
                    tokens[tok_num].kind = NUL;
                if ( tokens[tok_num].kind == NUL ) {
                    fprintf( stderr, "ERROR: REG %s out of range.\n",
                            tokens[tok_num].lexeme );
                    exit(1);
                }
            } else {
                // All other token kinds get this value,
                // which should never be used.
                tokens[tok_num].value = 0x1234567812345678LL;
            }

            // Set the DFA up to scan for the next token. Notice
            // that we ignore the current token (i.e., build
            // the new one on top of the one we just finished)
            // if the latter was a whitespace token because
            // we don't tell the caller about whitespace tokens.
            if( tokens[tok_num].kind != WHITESPACE ) {
                tok_num++;
                // If the tokens array does not have enough space
                // for the next token, make it larger with realloc.
                if(tok_num >= tok_array_size) {
                    tok_array_size = tok_array_size * 2 + 1;
                    tokens = (Token*)realloc(tokens,
                            tok_array_size * sizeof(tokens[0]));
                }
                // Also, allocate space for the new token's lexeme.
                // It can't be longer than the current line, so
                // just allocate enough space for the current line.
                tokens[tok_num].lexeme = (char*)malloc(strlen(input) + 1);
            }
            lexeme_len = 0;
            // The new token starts with the next input character.
            tokens[tok_num].lexeme[lexeme_len] = input[i];
            // Put the DFA in its start state and fetch the
            // transition from the start state for that character.
            state = delta[ST_START][input[i]];
        }
    }
    // The number of tokens we found in this line is passed
    // back to the caller via len.
    *len = tok_num;
    return tokens;
}

// kindString maps each kind to a string for use in error messages.

const char * kS [] = {
    "ID",           // ID
    "INT",          // INT
    "HEXINT",       // HEXINT
    "REGISTER",     // REGISTER
    "COMMA",        // COMMA
    "LPAREN",       // LPAREN
    "RPAREN",       // RPAREN
    "LABEL",        // LABEL
    "DOTWORD",      // DOTWORD
    "WHITESPACE",   // WHITESPACE
    "NUL"           // NUL
};

char *kindString( Kind k ){
    if ( k < ID || k > NUL ) return (char*) "INVALID";
    return (char*) kS[k];
}

char *read_line( void ) {
    // Read a line from standard input, and return this line
    // as a dynamically-allocated string.
    char *buf = NULL; int bufsize = 0;
    int i, c = 0;
    for(i=0;;i++) {
        c = getchar();
        if(c == EOF && i == 0) return NULL;
        if(c == EOF || c == '\n') c = '\0';
        if(i >= bufsize) {
            // Our buffer has no space left, so allocate a larger one.
            bufsize = 2 * bufsize + 1;
            buf = (char*)realloc(buf, bufsize*sizeof(buf[0]));
        }
        buf[i] = c;
        if(c == '\0')
            break;
    }
    return buf;
}

//======================================================================
//======= A sample program demonstrating the use of the scanner. =======
//======================================================================

int main() {

    // Read the entire input file, storing each line as a
    // single string in the array srcLines.
    char **srcLines = NULL;
    int numLinesAllocated = 0;
    int numLines;
    for (numLines=0;;numLines++) {
        char *buf = read_line();
        if(buf == NULL)
            break;
        if(numLines >= numLinesAllocated) {
            // The srcLines array does not have enough space
            // for the next line, so enlarge it with realloc.
            numLinesAllocated = 2 * numLinesAllocated + 1;
            srcLines = (char**)realloc(srcLines,
                    numLinesAllocated * sizeof(srcLines[0]));
        }
        srcLines[numLines] = buf;
    }

    // Tokenize each line, storing the results in tokLines.
    // tokCounts stores the number of tokens per line, so that
    // the tokens for line i are:
    // tokLines[i][0], tokLines[i][1], ..., tokLines[i][tokCounts[i]-1].
    Token **tokLines = (Token**)malloc(sizeof(Token*) * numLines);
    int *tokCounts = (int*)malloc(sizeof(int) * numLines);
    int line;
    for ( line=0; line < numLines; line++ ) {
        tokLines[line] = scan(srcLines[line], &tokCounts[line]);
    }


    // Now we process the tokens.
    // Sample usage: print the tokens of each line.
    for ( line=0; line < numLines; line++ ) {
        int j;
        for ( j=0; j < tokCounts[line]; j++ ) {
            Token token = tokLines[line][j];
            fprintf( stderr, "  Token: %s {%s}",
                    kindString(token.kind), token.lexeme );
            if(token.kind == INT || token.kind == HEXINT ||
                    token.kind == REGISTER) {
                fprintf( stderr, " (%lld)", token.value );
            }
            fprintf( stderr, "\n" );
        }
    }


    // Free all dynamically-allocated memory.
    // This is not strictly necessary, since the operating
    // system would reclaim the memory anyway when the program
    // exits, but it makes it easier to check for memory leaks
    // with tools like valgrind.
    for ( line=0; line < numLines; line++ ) {
        int j;
        for ( j=0; j <= tokCounts[line]; j++ ) {
            free( tokLines[line][j].lexeme );
        }
        free( srcLines[line] );
        free( tokLines[line] );
    }
    free(srcLines);
    free(tokLines);
    free(tokCounts);

    return 0;
}

