// CS241 Scanner

// Use as a model for assignments 3 and beyond if you wish
// Code contained here may be included in submissions for CS241
// assignments at the University of Waterloo.

// Modified Nov. 1, 2009 to correct bug in scanning empty line

// Gordon V. Cormack,  October 2009
// adapted from a version by Brad Lushman, May 2009
// original Java version due to Ondrej Lhotak, September 2007

//==============================================================================
//========= Declarations for the scan() function ===============================
//==============================================================================

// These restrictions will be added to the MIPS Assembly Language Specification

#define MAXLINE 255         // max. allowable input line length
#define MAXSTR (MAXLINE+2)  // space to store 1 char overlength + null terminator
#define MAXLINES 100000     // max. allowable input lines
#define MAXLABELS 10000     // max. allowable unique labels

// Each token has one of the following kinds

typedef enum {
    ID,             	// Opcode or identifier (use of a label)
    INT,            	// Decimal integer
    HEXINT,         	// Hexadecimal integer
    REGISTER,       	// Register number
    COMMA,          	// Comma
    LPAREN,         	// (
    RPAREN,         	// )
    LABEL,          	// Declaration of a label (with a colon)
    DOTWORD,        	// .word directive
    WHITESPACE,     	// Whitespace
    NUL             	// Bad/invalid token
} Kind;

// Each token is described by its kind, its lexeme and,
//  for kinds INT, HEXINT, REGISTER, its value as long long (64 bits)

typedef struct {
   Kind kind;
   char lexeme[MAXSTR];
   long long value;
} Token;

// scan() separates input line into tokens
//        returns an array of Token
//        sets num_tokens to the number of tokens scanned

Token *scan(char *input, int *num_tokens);

// kindString(k) returns string representation of kind k
// useful for error/debugging messages

char *kindString(Kind k);

// getline() reads a line of up to MAXLINE+1 characters
// returns NULL on end of file

char *getline();

// =========================================================================
// Implementation of scan() and associated type definitions
// If you just want to use the scanner, skip to next ===== separator


#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// States for the finite-state automaton that runs the scanner

typedef enum {
    ST_NUL,
    ST_START,
    ST_DOLLAR,
    ST_MINUS,
    ST_REGISTER,
    ST_INT,
    ST_ID,
    ST_LABEL,
    ST_COMMA,
    ST_LPAREN,
    ST_RPAREN,
    ST_ZERO,
    ST_ZEROX,
    ST_HEXINT,
    ST_COMMENT,
    ST_DOT,
    ST_DOTW,
    ST_DOTWO,
    ST_DOTWOR,
    ST_DOTWORD,
    ST_WHITESPACE
} State;

// The kind of token represented by each state

Kind stateKinds[] = {
    NUL, 		// ST_NUL
    WHITESPACE,		// ST_START
    NUL, 		// ST_DOLLAR
    NUL, 		// ST_MINUS
    REGISTER, 		// ST_REGISTER
    INT, 		// ST_INT
    ID,  		// ST_ID
    LABEL, 		// ST_LABEL
    COMMA, 		// ST_COMMA
    LPAREN, 		// ST_LPAREN
    RPAREN, 		// ST_RPAREN
    INT, 		// ST_ZERO
    NUL, 		// ST_ZEROX
    HEXINT, 		// ST_HEXINT
    WHITESPACE,		// ST_COMMENT
    NUL, 		// ST_DOT
    NUL, 		// ST_DOTW
    NUL, 		// ST_DOTWO
    NUL, 		// ST_DOTWOR
    DOTWORD, 		// ST_DOTWORD
    WHITESPACE 		// ST_WHITESPACE
};

State T[ST_WHITESPACE+1][256];

#define whitespace "\t\n\r "
#define letters   "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
#define digits    "0123456789"
#define hexDigits "0123456789ABCDEFabcdef"
#define oneToNine "123456789"
#define all NULL

void setT(int from, unsigned char *c, int to) {
   unsigned char *p;
   for (p=c;*p;p++) T[from][*p] = to;
}

void initT(){
   int i,j;
   // default transition is ST_NUL
   for (i=0;i<=ST_WHITESPACE;i++) {
      for (j=0;j<256;j++) {
         T[i][j] = ST_NUL;
      }
   }
   // non-null transitions in finite state machine
   setT( ST_START,    whitespace,     ST_WHITESPACE );
   setT( ST_START,    letters,        ST_ID         );
   setT( ST_ID,       letters digits, ST_ID         );
   setT( ST_START,    oneToNine,      ST_INT        );
   setT( ST_INT,      digits,         ST_INT        );
   setT( ST_START,    "-",            ST_MINUS      );
   setT( ST_MINUS,    oneToNine,      ST_INT        );
   setT( ST_START,    ",",            ST_COMMA      );
   setT( ST_START,    "(",            ST_LPAREN     );
   setT( ST_START,    ")",            ST_RPAREN     );
   setT( ST_START,    "$",            ST_DOLLAR     );
   setT( ST_DOLLAR,   digits,         ST_REGISTER   );
   setT( ST_REGISTER, digits,         ST_REGISTER   );
   setT( ST_START,    "0",            ST_ZERO       );
   setT( ST_ZERO,     "x",            ST_ZEROX      );
   setT( ST_ZERO,     oneToNine,      ST_INT        );
   setT( ST_ZEROX,    hexDigits,      ST_HEXINT     );
   setT( ST_HEXINT,   hexDigits,      ST_HEXINT     );
   setT( ST_ID,       ":",            ST_LABEL      );
   setT( ST_START,    ";",            ST_COMMENT    );
   setT( ST_START,    ".",            ST_DOT        );
   setT( ST_DOT,      "w",            ST_DOTW       );
   setT( ST_DOTW,     "o",            ST_DOTWO      );
   setT( ST_DOTWO,    "r",            ST_DOTWOR     );
   setT( ST_DOTWOR,   "d",            ST_DOTWORD    );

   // comment includes all characters
   for (j=0;j<256;j++) T[ST_COMMENT][j] = ST_COMMENT;
}

Token *scan(char *input, int *len){
   Token r[MAXLINE]; // can't be more than MAXLINE tokens!
   int inlen = strlen(input);
   State state = ST_START;
   int i, tok_num, lexeme_len;

   static int done = 0;
   if (!done && (done=1)) initT();

   for (i=tok_num=lexeme_len=0; i<=inlen; i++,lexeme_len++) {
      // process each character in input, including NUL terminator
      State t = T[state][input[i]];
      if (t == ST_NUL) {
         // process token
         r[tok_num].lexeme[lexeme_len] = 0; // NUL string terminator 
         r[tok_num].kind = stateKinds[state];
         if (r[tok_num].kind == INT) {
            sscanf(r[tok_num].lexeme,"%lld",&r[tok_num].value);
            if (strlen(r[tok_num].lexeme) > 11) r[tok_num].kind = NUL; // number too big
            if (r[tok_num].value < -2147483647-1  || r[tok_num].value > 2147483647 ) r[tok_num].kind = NUL; 
         } else if (r[tok_num].kind == HEXINT) {
            sscanf(r[tok_num].lexeme,"%llx",&r[tok_num].value);
            if (strlen(r[tok_num].lexeme) > 10) r[tok_num].kind = NUL; // number too big
         } else if (r[tok_num].kind == REGISTER) {
            sscanf(r[tok_num].lexeme+1,"%lld",&r[tok_num].value);
            if (r[tok_num].value > 31) r[tok_num].kind = NUL;       // invalid register
         } else r[tok_num].value = 0x1234567812345678LL; // shouldn't ever use this value
         if (r[tok_num].kind != WHITESPACE) tok_num++; // skip whitespace
         lexeme_len = 0;
         r[tok_num].lexeme[lexeme_len] = input[i];
         state = T[ST_START][input[i]];
      } else {
         r[tok_num].lexeme[lexeme_len] = input[i]; // copy next lexeme char
         state = t;
      }
   }

   *len = tok_num;
   Token *rcopy = (Token *) malloc(i * sizeof(Token));
   for (i=0;i<tok_num;i++) rcopy[i] = r[i];
   return rcopy;
}

// kindString maps each kind to a string for use in error messages

char * kS [] = { 
    "ID",		// ID
    "INT",		//INT
    "HEXINT",		// HEXINT
    "REGISTER",		// REGISTER
    "COMMA",		// COMMA
    "LPAREN",		// LPAREN
    "RPAREN",		// RPAREN
    "LABEL",		// LABEL
    "DOTWORD",		// DOTWORD
    "WHITESPACE",	// WHITESPACE
    "NUL" 		// NUL
};

char *kindString(Kind k){
   if (k < ID || k > NUL) return "INVALID";
   return kS[k];
}

char *getline() {
   int i,c;
   char buf[MAXSTR];
   for (i=0; i < MAXLINE && EOF != (c=getchar()) && '\n' != c ; i++) {
      buf[i] = c;
   }
   if (EOF == c && i == 0) return NULL;  // end of file
   buf[i] = 0;                           // null string terminator
   return strdup(buf);                   // return copy of string
}

//==============================================================================
// ====== A sample program demonstrating the use of the scanner. ===============
//==============================================================================

char * lin[MAXLINES+1];  // space for max lines plus one extra
int line, lines;

void check(int x, char *msg){
  if (!x) {
     fprintf(stderr,"%s\n",lin[line]);
     fprintf(stderr,"ERROR - %s\n",msg);
     exit(1);
  }
} 

int main() {

   // read input lines
   for (lines=0; lin[lines] = getline(); lines++ ) {
      check(strlen(lin[lines]) <= MAXLINE,"input line too long");
      check(lines < MAXLINES, "too many input lines");
   }

   // print tokens on each line
   for (line=0; line < lines; line++) {
      int tlist_length;
      Token *tlist = scan(lin[line],&tlist_length);

      int i;
      for( i = 0; i < tlist_length ; i++) {
         printf("   Token: %s {%s}\n",kindString(tlist[i].kind),tlist[i].lexeme);
      }
      free(tlist);
   }

   // free storage
   for (line=0; line < lines; line++) {
      free(lin[line]);
   }
}

