608 lines
18 KiB
C++
608 lines
18 KiB
C++
// Copyright 2023 xensik. All rights reserved.
|
|
//
|
|
// Use of this source code is governed by a GNU GPLv3 license
|
|
// that can be found in the LICENSE file.
|
|
|
|
#include "stdinc.hpp"
|
|
#include "lexer.hpp"
|
|
#include "context.hpp"
|
|
#include "utils/string.hpp"
|
|
|
|
namespace xsk::gsc
|
|
{
|
|
|
|
lexer::lexer(context const* ctx, std::string const& name, char const* data, usize size) : ctx_{ ctx }, reader_{ data, size }, loc_{ &name }, buflen_{ 0 }, spacing_{ spacing::null }, indev_{ false }
|
|
{
|
|
}
|
|
|
|
// Scans the input and returns the next token.
//
// Whitespace (' ', '\t', '\r') and all comment forms ('//', '/* */', '/@ @/')
// are skipped. In non-dev builds a whole '/# ... #/' devblock is skipped as
// well; in dev builds DEVBEGIN / DEVEND tokens are produced instead.
//
// Every '\n' yields a NEWLINE token. When the input ends without a trailing
// '\n', one synthetic NEWLINE is returned before EOS.
//
// Literal text (strings, names, paths, numbers) is accumulated in buffer_ via
// push(); buflen_ is reset on entry so each call starts with an empty buffer.
//
// Throws comp_error for malformed input (unterminated strings/comments,
// unmatched devblock markers, invalid numeric literals, unknown characters)
// and error if a literal exceeds the buffer capacity enforced by push().
auto lexer::lex() -> token
{
    buflen_ = 0;

    while (true)
    {
        // References, not copies: both track the reader as advance() is called.
        auto& last = reader_.last_byte;
        auto& curr = reader_.curr_byte;
        auto path = false;      // set once a '\' separator is seen inside a name
        auto localize = false;  // set when the string was introduced by '&"'
        loc_.step();

        if (reader_.ended())
        {
            if (indev_)
                throw comp_error(loc_, "unmatched devblock start ('/#')");

            if (curr == 0 && last != '\n')
            {
                // Input did not end with a newline: emit one synthetic NEWLINE.
                // Overwriting curr makes this branch fail on the next call so
                // that EOS is returned afterwards.
                // NOTE(review): relies on ended() not depending on curr_byte - confirm.
                curr = -1;
                return token{ token::NEWLINE, spacing_, loc_ };
            }
            else
                return token{ token::EOS, spacing_, loc_ };
        }

        // Classify the spacing in front of the upcoming token from the byte
        // consumed just before it.
        if (last == 0 || last == '\n')
            spacing_ = spacing::null;
        else if (last == ' ' || last == '\t')
            spacing_ = (spacing_ == spacing::null) ? spacing::empty : spacing::back;
        else
            spacing_ = spacing::none;

        advance();

        // From here on `last` is the byte being tokenized and `curr` is the lookahead.
        switch (last)
        {
            case ' ':
            case '\t':
            case '\r':
                loc_.step();
                continue;
            case '\n':
                loc_.lines();
                loc_.step();
                return token{ token::NEWLINE, spacing_, loc_ };
            case '\\':
                throw comp_error(loc_, "invalid token ('\\')");
            case '/':
                if (curr != '=' && curr != '#' && curr != '@' && curr != '*' && curr != '/')
                    return token{ token::DIV, spacing_, loc_ };

                advance();

                if (last == '=')
                    return token{ token::DIVEQ, spacing_, loc_ };

                if (last == '#')
                {
                    if (indev_)
                        throw comp_error(loc_, "cannot recurse devblock ('/#')");

                    if (ctx_->build() == build::dev)
                    {
                        indev_ = true;
                        return token{ token::DEVBEGIN, spacing_, loc_ };
                    }
                    else
                    {
                        // Non-dev build: discard everything up to and including '#/'.
                        // NOTE(review): on the first iteration `last` is still the
                        // opening '#', so '/#/' is accepted as a complete block; the
                        // same holds for '/@/' and '/*/' below - confirm intended.
                        while (true)
                        {
                            if (reader_.ended())
                                throw comp_error(loc_, "unmatched devblock start ('/#')");

                            if (curr == '\n')
                            {
                                loc_.lines();
                                loc_.step();
                            }
                            else if (last == '#' && curr == '/')
                            {
                                advance();
                                break;
                            }

                            advance();
                        }
                    }
                }
                else if (last == '@')
                {
                    // Script doc comment: '/@ ... @/'.
                    while (true)
                    {
                        if (reader_.ended())
                            throw comp_error(loc_, "unmatched script doc comment start ('/@')");

                        if (curr == '\n')
                        {
                            loc_.lines();
                            loc_.step();
                        }
                        else if (last == '@' && curr == '/')
                        {
                            advance();
                            break;
                        }

                        advance();
                    }
                }
                else if (last == '*')
                {
                    // Multiline comment: '/* ... */'.
                    while (true)
                    {
                        if (reader_.ended())
                            throw comp_error(loc_, "unmatched multiline comment start ('/*')");

                        if (curr == '\n')
                        {
                            loc_.lines();
                            loc_.step();
                        }
                        else if (last == '*' && curr == '/')
                        {
                            advance();
                            break;
                        }

                        advance();
                    }
                }
                else if (last == '/')
                {
                    // Line comment: stop in front of the '\n' (or at end of input)
                    // so the newline is still reported as a token.
                    while (true)
                    {
                        if (reader_.ended())
                            break;

                        if (curr == '\n')
                            break;

                        advance();
                    }
                }
                continue;
            case '#':
                if (curr == '/')
                {
                    if (!indev_)
                        throw comp_error(loc_, "unmatched devblock end ('#/')");

                    advance();
                    indev_ = false;
                    return token{ token::DEVEND, spacing_, loc_ };
                }

                return token{ token::SHARP, spacing_, loc_ };
            case '*':
                if (curr != '=' && curr != '/')
                    return token{ token::STAR, spacing_, loc_ };

                advance();

                if (last == '=')
                    return token{ token::STAREQ, spacing_, loc_ };

                // A '*/' outside of a comment.
                throw comp_error(loc_, "unmatched multiline comment end ('*/')");
            case '"':
                goto lex_string;
            case '.':
                if (curr < '0' || curr > '9')
                    return token{ token::DOT, spacing_, loc_ };
                goto lex_number;
            case '(':
                return token{ token::LPAREN, spacing_, loc_ };
            case ')':
                return token{ token::RPAREN, spacing_, loc_ };
            case '{':
                return token{ token::LBRACE, spacing_, loc_ };
            case '}':
                return token{ token::RBRACE, spacing_, loc_ };
            case '[':
                return token{ token::LBRACKET, spacing_, loc_ };
            case ']':
                return token{ token::RBRACKET, spacing_, loc_ };
            case ',':
                return token{ token::COMMA, spacing_, loc_ };
            case ';':
                return token{ token::SEMICOLON, spacing_, loc_ };
            case ':':
                if (curr != ':')
                    return token{ token::COLON, spacing_, loc_ };

                advance();
                return token{ token::DOUBLECOLON, spacing_, loc_ };
            case '?':
                return token{ token::QMARK, spacing_, loc_ };
            case '=':
                if (curr != '=')
                    return token{ token::ASSIGN, spacing_, loc_ };

                advance();
                return token{ token::EQ, spacing_, loc_ };
            case '+':
                if (curr != '+' && curr != '=')
                    return token{ token::PLUS, spacing_, loc_ };

                advance();

                if (last == '+')
                    return token{ token::INC, spacing_, loc_ };

                return token{ token::PLUSEQ, spacing_, loc_ };
            case '-':
                if (curr != '-' && curr != '=')
                    return token{ token::MINUS, spacing_, loc_ };

                advance();

                if (last == '-')
                    return token{ token::DEC, spacing_, loc_ };

                return token{ token::MINUSEQ, spacing_, loc_ };
            case '%':
                if (curr != '=')
                    return token{ token::MOD, spacing_, loc_ };

                advance();

                return token{ token::MODEQ, spacing_, loc_ };
            case '|':
                if (curr != '|' && curr != '=')
                    return token{ token::BITOR, spacing_, loc_ };

                advance();

                if (last == '|')
                    return token{ token::OR, spacing_, loc_ };

                return token{ token::BITOREQ, spacing_, loc_ };
            case '&':
                if (curr != '&' && curr != '=' && curr != '"')
                    return token{ token::BITAND, spacing_, loc_ };

                advance();

                if (last == '&')
                    return token{ token::AND, spacing_, loc_ };

                if (last == '=')
                    return token{ token::BITANDEQ, spacing_, loc_ };

                // '&"' introduces a string that is returned as ISTRING.
                localize = true;
                goto lex_string;
            case '^':
                if (curr != '=')
                    return token{ token::BITEXOR, spacing_, loc_ };

                advance();
                return token{ token::BITEXOREQ, spacing_, loc_ };
            case '!':
                if (curr != '=')
                    return token{ token::BANG, spacing_, loc_ };

                advance();
                return token{ token::NE, spacing_, loc_ };
            case '~':
                return token{ token::TILDE, spacing_, loc_ };
            case '<':
                if (curr != '<' && curr != '=')
                    return token{ token::LT, spacing_, loc_ };

                advance();
                if (last == '=')
                    return token{ token::LE, spacing_, loc_ };

                if (curr != '=')
                    return token{ token::SHL, spacing_, loc_ };

                advance();
                return token{ token::SHLEQ, spacing_, loc_ };
            case '>':
                if (curr != '>' && curr != '=')
                    return token{ token::GT, spacing_, loc_ };

                advance();

                if (last == '=')
                    return token{ token::GE, spacing_, loc_ };

                if (curr != '=')
                    return token{ token::SHR, spacing_, loc_ };

                advance();
                return token{ token::SHREQ, spacing_, loc_ };
            default:
                if (last >= '0' && last <= '9')
                    goto lex_number;
                else if (last == '_' || (last >= 'A' && last <= 'Z') || (last >= 'a' && last <= 'z'))
                    goto lex_name;

                throw comp_error(loc_, fmt::format("bad token: '{}'", last));
        }

lex_string:
        // The opening quote has already been consumed; collect bytes up to the
        // closing quote, translating the recognised escape sequences.
        while (true)
        {
            if (reader_.ended())
                throw comp_error(loc_, "unmatched string start ('\"')");

            if (curr == '"')
            {
                advance();
                break;
            }

            if (curr == '\n')
                throw comp_error(loc_, "unterminated string literal");

            if (curr == '\\')
            {
                advance();

                if (reader_.ended())
                    throw comp_error(loc_, "invalid token ('\\')");

                // Unrecognised escapes keep the escaped character as is.
                char c = curr;
                switch (curr)
                {
                    case 't': c = '\t'; break;
                    case 'r': c = '\r'; break;
                    case 'n': c = '\n'; break;
                    case '"': c = '\"'; break;
                    case '\\': c = '\\'; break;
                    default: break;
                }

                push(c);
            }
            else
                push(curr);

            advance();
        }

        if (localize)
            return token{ token::ISTRING, spacing_, loc_, std::string{ &buffer_[0], buflen_ } };

        return token{ token::STRING, spacing_, loc_, std::string{ &buffer_[0], buflen_ } };

lex_name:
        // `last` holds the first character of the identifier.
        push(last);

        while (true)
        {
            if (reader_.ended())
                break;

            if (!(curr == '\\' || curr == '_' || (curr > 64 && curr < 91) || (curr > 96 && curr < 123) || (curr > 47 && curr < 58)))
                break;

            if (curr == '\\')
            {
                if (last == '\\')
                    throw comp_error(loc_, "invalid path '\\\\'");

                // A backslash turns the name into a path; it is stored as '/'.
                path = true;
                push('/');
            }
            else
                push(curr);

            advance();
        }

        if (path)
        {
            if (buffer_[buflen_ - 1] == '/')
                throw comp_error(loc_, "invalid path end '\\'");

            return token{ token::PATH, spacing_, loc_, ctx_->make_token(std::string_view{ &buffer_[0], buflen_ }) };
        }

        return token{ token::NAME, spacing_, loc_, std::string{ &buffer_[0], buflen_ } };

lex_number:
        // Decimal / float literal: anything that is not a '0' followed by one of
        // the radix prefixes 'o', 'b' or 'x'.
        if (last == '.' || last != '0' || (last == '0' && (curr != 'o' && curr != 'b' && curr != 'x')))
        {
            push(last);

            auto dot = last == '.' ? 1 : 0;  // number of '.' seen
            auto flt = 0;                    // number of 'f' seen

            while (true)
            {
                if (reader_.ended())
                    break;

                // A digit separator (') may not follow another separator, 'f' or '.',
                // and may not be followed by '.' or 'f'.
                if (curr == '\'' && (last == '\'' || last == 'f' || last == '.'))
                    throw comp_error(loc_, "invalid number literal");

                if ((curr == '.' || curr == 'f') && last == '\'')
                    throw comp_error(loc_, "invalid number literal");

                if (curr == '\'')
                {
                    // Separators are consumed but not stored.
                    advance();
                    continue;
                }

                if (curr == 'f')
                    flt++;
                else if (curr == '.')
                    dot++;
                else if (!(curr > 47 && curr < 58))
                    break;

                push(curr);
                advance();
            }

            if (last == '\'')
                throw comp_error(loc_, "invalid number literal");

            // At most one '.', at most one 'f', and an 'f' must be the last character.
            if (dot > 1 || flt > 1 || (flt && buffer_[buflen_ - 1] != 'f'))
                throw comp_error(loc_, "invalid number literal");

            if (dot || flt)
                return token{ token::FLT, spacing_, loc_, std::string{ &buffer_[0], buflen_ } };

            return token{ token::INT, spacing_, loc_, std::string{ &buffer_[0], buflen_ } };
        }
        else if (curr == 'o')
        {
            // Octal: the '0o' prefix is not stored, only the digits.
            advance();

            while (true)
            {
                if (reader_.ended())
                    break;

                if ((curr == '\'' && (last == '\'' || last == 'o')) || (curr == 'o' && last == '\''))
                    throw comp_error(loc_, "invalid octal literal");

                if (curr == '\'')
                {
                    advance();
                    continue;
                }

                if (!(curr > 47 && curr < 56))
                    break;

                push(curr);
                advance();
            }

            // Reject a trailing separator and a prefix without digits.
            if (last == '\'' || buflen_ <= 0)
                throw comp_error(loc_, "invalid octal literal");

            // The conversion helper takes a C string, so terminate the buffer.
            push('\0');

            return token{ token::INT, spacing_, loc_, utils::string::oct_to_dec(&buffer_[0]) };
        }
        else if (curr == 'b')
        {
            // Binary: the '0b' prefix is stored, so fewer than 3 bytes means no digits.
            push(last);
            push(curr);
            advance();

            while (true)
            {
                if (reader_.ended())
                    break;

                if ((curr == '\'' && (last == '\'' || last == 'b')) || (curr == 'b' && last == '\''))
                    throw comp_error(loc_, "invalid binary literal");

                if (curr == '\'')
                {
                    advance();
                    continue;
                }

                if (curr != '0' && curr != '1')
                    break;

                push(curr);
                advance();
            }

            if (last == '\'' || buflen_ < 3)
                throw comp_error(loc_, "invalid binary literal");

            push('\0');

            return token{ token::INT, spacing_, loc_, utils::string::bin_to_dec(&buffer_[0]) };
        }
        else if (curr == 'x')
        {
            // Hexadecimal: the '0x' prefix is stored, so fewer than 3 bytes means no digits.
            push(last);
            push(curr);
            advance();

            while (true)
            {
                if (reader_.ended())
                    break;

                if ((curr == '\'' && (last == '\'' || last == 'x')) || (curr == 'x' && last == '\''))
                    throw comp_error(loc_, "invalid hexadecimal literal");

                if (curr == '\'')
                {
                    advance();
                    continue;
                }

                // Accept 0-9, A-F and a-f.
                if (!((curr > 47 && curr < 58) || (curr > 64 && curr < 71) || (curr > 96 && curr < 103)))
                    break;

                push(curr);
                advance();
            }

            if (last == '\'' || buflen_ < 3)
                throw comp_error(loc_, "invalid hexadecimal literal");

            push('\0');

            return token{ token::INT, spacing_, loc_, utils::string::hex_to_dec(&buffer_[0]) };
        }

        // Every branch above returns or throws.
        throw error("UNEXPECTED LEXER INTERNAL ERROR");
    }
}
|
|
|
|
auto lexer::push(char c) -> void
|
|
{
|
|
if (buflen_ >= 0x1000)
|
|
throw error("lexer: max literal size exceeded");
|
|
|
|
buffer_[buflen_++] = c;
|
|
}
|
|
|
|
auto lexer::advance() -> void
|
|
{
|
|
reader_.advance();
|
|
loc_.end.column++;
|
|
|
|
if (reader_.curr_byte == '\\') [[unlikely]]
|
|
linewrap();
|
|
}
|
|
|
|
auto lexer::linewrap() -> void
|
|
{
|
|
while (reader_.curr_byte == '\\')
|
|
{
|
|
if (reader_.available == 1)
|
|
throw comp_error(loc_, "invalid token ('\\')");
|
|
|
|
if (reader_.buffer_pos[1] != '\r' && reader_.buffer_pos[1] != '\n')
|
|
break;
|
|
|
|
if (reader_.buffer_pos[1] == '\r')
|
|
{
|
|
if (reader_.available <= 3 || reader_.buffer_pos[2] != '\n')
|
|
throw comp_error(loc_, "invalid token ('\\')");
|
|
|
|
reader_.buffer_pos += 3;
|
|
reader_.available -= 3;
|
|
}
|
|
|
|
if ((reader_.buffer_pos[1] == '\n'))
|
|
{
|
|
if (reader_.available == 2)
|
|
throw comp_error(loc_, "invalid token ('\\')");
|
|
|
|
reader_.buffer_pos += 2;
|
|
reader_.available -= 2;
|
|
}
|
|
|
|
reader_.curr_byte = reader_.available ? *reader_.buffer_pos : 0;
|
|
|
|
loc_.lines();
|
|
loc_.step();
|
|
}
|
|
}
|
|
|
|
} // namespace xsk::gsc
|