Później zamienię wyrażenia regularne na coś innego, a obecnie zajmuję się analizą wyodrębnionych jednostek leksykalnych. Czy da się napisać kompilator w ten sposób, czy takie podejście zaprowadzi mnie w ślepy zaułek?
<?php namespace ELECTIONS\Parser;
// Token type identifiers shared by the lexer and the parser.
// Values are unique integers; they are grouped by category below and listed
// in ascending numeric order inside each group.

// Special / literal-like tokens.
const TOKEN_EPSILON = 0,
      TOKEN_STRING  = 1,
      TOKEN_COMMENT = 2;

// Top-level and section keywords.
const TOKEN_OBIEKT      = 3,
      TOKEN_MAPA        = 4,
      TOKEN_EVENT       = 5,
      TOKEN_DESCRIPTION = 6,
      TOKEN_TALK        = 7,
      TOKEN_END         = 8,
      TOKEN_PROPERTIES  = 19,
      TOKEN_RETURN      = 24,
      TOKEN_MAIN        = 40;

// Control-flow keywords.
const TOKEN_IF    = 9,
      TOKEN_ELSE  = 10,
      TOKEN_FOR   = 11,
      TOKEN_WHILE = 12;

// Paired delimiters.
const TOKEN_LEFT_STAR         = 13,
      TOKEN_RIGHT_STAR        = 14,
      TOKEN_LEFT_PARENTHESIS  = 15,
      TOKEN_RIGHT_PARENTHESIS = 16,
      TOKEN_LEFT_BRACKET      = 17,
      TOKEN_RIGHT_BRACKET     = 18,
      TOKEN_LEFT_BRACE        = 34,
      TOKEN_RIGHT_BRACE       = 35;

// Literals, type names and identifiers.
const TOKEN_INT       = 20,
      TOKEN_FLOAT     = 21,
      TOKEN_BOOL      = 22,
      TOKEN_NAME      = 32,
      TOKEN_DATA_TYPE = 39;

// Punctuation and operators.
const TOKEN_AT           = 23,
      TOKEN_ASSIGN       = 25,
      TOKEN_COLON        = 26,
      TOKEN_SEMICOLON    = 27,
      TOKEN_DOT          = 28,
      TOKEN_PLUS_ASSIGN  = 29,
      TOKEN_MINUS_ASSIGN = 30,
      TOKEN_MACRO        = 31,
      TOKEN_COMMA        = 33,
      TOKEN_GREATER_THAN = 36,
      TOKEN_LESS_THAN    = 37,
      TOKEN_SIGN         = 38;
/**
 * A single lexical unit: a token type paired with the text it was built from.
 */
class Token
{
    /**
     * Token type, as passed to the constructor.
     * @var mixed
     */
    public $type;

    /**
     * Text associated with the token, as passed to the constructor.
     * @var mixed
     */
    public $text;

    /**
     * Stores both constructor arguments unchanged on the new instance.
     *
     * @param mixed $token    value stored in $type
     * @param mixed $sequence value stored in $text
     */
    public function __construct($token, $sequence)
    {
        $this->text = $sequence;
        $this->type = $token;
    }
}
/**
 * Hand-written top-down parser for ELECTIONS source code.
 *
 * The token list produced by Lexer is walked with PHP's internal array
 * pointer: next()/prev() move the pointer, and each parseXxx() method is
 * entered with the pointer on the token that introduced the construct.
 */
class Parser
{
    /**
     * Tokens returned by lexer
     * @var Token[]
     */
    private $tokens = [];

    /**
     * OBIEKT definitions collected by parseObiekt().
     * @var Obiekt[]
     */
    private $obiekts = [];

    /**
     * @var Mapa[]
     */
    private $mapas = [];

    /**
     * @var Talk[]
     */
    private $talks = [];

    /**
     * OBIEKT definition currently being filled in (null outside of one).
     * Declared explicitly instead of being created as a dynamic property.
     * @var \Obiekt|null
     */
    private $obiekt = null;

    /**
     * Parse ELECTIONS code
     *
     * @param string $input source code as string
     * @return null nothing is returned yet; the collected structures are
     *              printed with var_dump() (final return value still undecided)
     * @throws \Exception on any lexical or syntax error
     */
    public function parse($input)
    {
        $this->tokens = $this->createLexer()->tokenize($input);
        $this->obiekts = [];
        $this->mapas = [];
        $this->talks = [];
        $this->obiekt = null;

        $token = reset($this->tokens);
        while($token)
        {
            switch($token->type)
            {
                case TOKEN_OBIEKT:
                    $this->parseObiekt();
                    break;
                case TOKEN_MAPA:
                    $this->parseMapa();
                    break;
                case TOKEN_MAIN:
                    // No lexer rule emits TOKEN_MAIN at the moment.
                    $this->parseMain();
                    break;
                default:
                    throw new \Exception('Unexpected '.$token->text);
            }
            // Use the built-in next() here, not $this->next(): running out of
            // tokens between top-level definitions is the normal way to finish,
            // whereas $this->next() would throw 'No next token'.
            $token = next($this->tokens);
        }
        var_dump($this->obiekts, $this->mapas, $this->talks);
    }

    /**
     * Builds the lexer with every token rule of the language.
     * Rules are tried in registration order and the first match wins, so
     * longer/more specific patterns must be registered before shorter ones.
     *
     * @return Lexer
     */
    private function createLexer()
    {
        // Keywords must not be followed by an identifier character, otherwise
        // e.g. "iframe" would be split into the keyword "if" and the name "rame".
        $keyword = function($words)
        {
            return '(?:'.$words.')(?![a-zA-Z0-9_$])';
        };

        $lexer = new Lexer;
        // Double-quoted string; a backslash escapes the following character.
        $lexer->add('"(?:[^"\\\\]|\\\\.)*"', TOKEN_STRING);
        // Line comment; also matches on the last line without a trailing newline.
        $lexer->add('//[^\n]*', TOKEN_COMMENT);
        $lexer->add($keyword('OBIEKT'), TOKEN_OBIEKT);
        $lexer->add($keyword('MAPA'), TOKEN_MAPA);
        $lexer->add($keyword('EVENT'), TOKEN_EVENT);
        $lexer->add($keyword('DESCRIPTION'), TOKEN_DESCRIPTION);
        $lexer->add($keyword('TALK'), TOKEN_TALK);
        $lexer->add($keyword('END'), TOKEN_END);
        $lexer->add($keyword('if'), TOKEN_IF);
        $lexer->add($keyword('else'), TOKEN_ELSE);
        $lexer->add($keyword('for'), TOKEN_FOR);
        $lexer->add($keyword('while'), TOKEN_WHILE);
        $lexer->add($keyword('string|int|float|bool'), TOKEN_DATA_TYPE);
        $lexer->add('\\(\\*', TOKEN_LEFT_STAR);
        $lexer->add('\\*\\)', TOKEN_RIGHT_STAR);
        $lexer->add('\\(', TOKEN_LEFT_PARENTHESIS);
        $lexer->add('\\)', TOKEN_RIGHT_PARENTHESIS);
        $lexer->add($keyword('PROPERTIES'), TOKEN_PROPERTIES);
        $lexer->add('[0-9]+\\.[0-9]+', TOKEN_FLOAT);
        $lexer->add('0x[0-9A-F]+', TOKEN_INT);
        $lexer->add('[0-9]+', TOKEN_INT);
        $lexer->add($keyword('true|false'), TOKEN_BOOL);
        $lexer->add('@', TOKEN_AT);
        $lexer->add($keyword('RETURN'), TOKEN_RETURN);
        $lexer->add('=', TOKEN_ASSIGN);
        $lexer->add(':', TOKEN_COLON);
        $lexer->add(';', TOKEN_SEMICOLON);
        $lexer->add('\\.', TOKEN_DOT);
        $lexer->add('\\+=', TOKEN_PLUS_ASSIGN);
        // Identifiers come after all keywords so that keywords take precedence.
        $lexer->add('[a-zA-Z_$][a-zA-Z0-9_$]*', TOKEN_NAME);
        $lexer->add('\\#', TOKEN_MACRO);
        $lexer->add(',', TOKEN_COMMA);
        $lexer->add('\\{', TOKEN_LEFT_BRACE);
        $lexer->add('\\}', TOKEN_RIGHT_BRACE);
        $lexer->add('<', TOKEN_LESS_THAN);
        $lexer->add('>', TOKEN_GREATER_THAN);
        $lexer->add('-', TOKEN_SIGN);
        $lexer->add('\\+', TOKEN_SIGN);
        return $lexer;
    }

    /**
     * Advances to the next token and returns it.
     *
     * @param int|null $type when given, the token must be of this type
     * @return Token
     * @throws \Exception when there is no next token or its type differs
     */
    private function next($type = null)
    {
        $token = next($this->tokens);
        if(false === $token)
        {
            throw new \Exception('No next token');
        }
        $this->expect($token, $type);
        return $token;
    }

    /**
     * Steps back to the previous token and returns it.
     *
     * @param int|null $type when given, the token must be of this type
     * @return Token
     * @throws \Exception when there is no previous token or its type differs
     */
    private function prev($type = null)
    {
        $token = prev($this->tokens);
        if(false === $token)
        {
            throw new \Exception('No previous token');
        }
        $this->expect($token, $type);
        return $token;
    }

    /**
     * Throws when $type is given and $token is of a different type.
     *
     * @param Token $token
     * @param int|null $type
     * @throws \Exception
     */
    private function expect(Token $token, $type)
    {
        if(null !== $type && $token->type !== $type)
        {
            throw new \Exception(sprintf('Unexpected %s expecting %s', $token->text, $this->getTokenName($type)));
        }
    }

    /**
     * Returns the TOKEN_* constant name for a token type (used in messages).
     * todo: to be replaced by a different mechanism
     *
     * @param Token|int|string|null $type a token, a token type, or null for
     *                                    the current token; a non-numeric
     *                                    value is returned unchanged
     * @return string
     * @throws \Exception when no TOKEN_* constant has the given value
     */
    private function getTokenName($type = null)
    {
        if($type instanceof Token)
        {
            $type = $type->type;
        }
        elseif(is_null($type))
        {
            $current = current($this->tokens);
            if(false === $current)
            {
                return 'end of input';
            }
            $type = $current->type;
        }
        elseif(!is_numeric($type))
        {
            return $type;
        }
        $type = (int)$type;

        // The namespace part of a constant name is case-insensitive, so the
        // prefix is compared without regard to case.
        $prefix = '' === __NAMESPACE__ ? '' : __NAMESPACE__.'\\';
        $prefixLength = strlen($prefix);
        foreach(get_defined_constants(true)['user'] as $name => $value)
        {
            if($value === $type && 0 === strncasecmp($name, $prefix.'TOKEN_', $prefixLength + 6))
            {
                return substr($name, $prefixLength);
            }
        }
        throw new \Exception(sprintf('Token %d does not exist', $type));
    }

    /**
     * Moves forward until a token of the given type is reached.
     *
     * @param int|null $type type to stop at; when empty, jumps to the last token
     * @return Token|false the token found (false only for an empty token list)
     * @throws \Exception when the input ends before such a token is found
     */
    private function skip($type = null)
    {
        if(empty($type))
        {
            return end($this->tokens);
        }
        while($token = $this->next())
        {
            if($token->type === $type)
            {
                return $token;
            }
        }
    }

    /**
     * Parses "OBIEKT name (* ... *)" with its DESCRIPTION, PROPERTIES and
     * EVENT sections; on return the pointer is on the closing "*)".
     *
     * @return bool true once the closing "*)" has been reached
     * @throws \Exception on an unexpected token
     */
    private function parseObiekt()
    {
        $this->obiekt = new \Obiekt; //todo: move into a class of this namespace
        $this->obiekts[] = $this->obiekt;
        $this->obiekt->name = $this->next(TOKEN_NAME)->text;
        $this->next(TOKEN_LEFT_STAR);
        while(true)
        {
            switch($this->next()->type)
            {
                case TOKEN_DESCRIPTION:
                    $this->parseDescription();
                    break;
                case TOKEN_PROPERTIES:
                    $this->parseProperties();
                    break;
                case TOKEN_EVENT:
                    $this->parseEvent();
                    break;
                case TOKEN_RIGHT_STAR:
                    return true;
                default:
                    throw new \Exception('Unexpected '.$this->getTokenName().' in '.$this->obiekt->name.' definition.');
            }
        }
    }

    /**
     * Parsing of MAPA definitions is not implemented yet.
     *
     * @throws \Exception always
     */
    private function parseMapa()
    {
        throw new \Exception('MAPA definitions are not supported yet');
    }

    /**
     * Parsing of the MAIN section is not implemented yet.
     *
     * @throws \Exception always
     */
    private function parseMain()
    {
        throw new \Exception('MAIN section is not supported yet');
    }

    /**
     * Parses "DESCRIPTION = <string | macro>;" of the current OBIEKT.
     *
     * @throws \Exception on an unexpected token
     */
    private function parseDescription()
    {
        $this->next(TOKEN_ASSIGN);
        $token = $this->next();
        switch($token->type)
        {
            case TOKEN_STRING:
                $this->obiekt->description = $token->text;
                break;
            case TOKEN_MACRO:
                // parseMacro() does not produce a value yet, so this stores null.
                $this->obiekt->description = $this->parseMacro();
                break;
            default:
                throw new \Exception('Unexpected '.$this->getTokenName().' expecting string, number or macro');
        }
        $this->next(TOKEN_SEMICOLON);
    }

    /**
     * Parses "PROPERTIES: name = expression; ..." of the current OBIEKT.
     *
     * @throws \Exception on an unexpected token
     */
    private function parseProperties()
    {
        $this->next(TOKEN_COLON);
        $next = $this->next();
        while($next->type === TOKEN_NAME)
        {
            $this->next(TOKEN_ASSIGN);
            $this->obiekt->addProperty($next->text, $this->parseExpression());
            $this->next(TOKEN_SEMICOLON);
            $next = $this->next();
        }
        // The token that ended the list belongs to the caller: step back onto
        // the last token of the section so the caller's next() reads it again.
        $this->prev();
    }

    /**
     * Placeholder for expression parsing: consumes exactly one token and
     * returns nothing yet.
     *
     * @return null
     */
    private function parseExpression()
    {
        $token = $this->next();
        switch($token->type)
        {
            case TOKEN_INT:
                break; //todo: continue here
        }
    }

    /**
     * Parses "EVENT [type] name(arg: type, ...): { commands }".
     * Arguments may be separated by commas; the separator is optional.
     *
     * @throws \Exception on an unexpected token
     */
    private function parseEvent()
    {
        $token = $this->next();
        if($token->type === TOKEN_DATA_TYPE)
        {
            $returnType = $token->text;
            $name = $this->next(TOKEN_NAME)->text;
        }
        elseif($token->type === TOKEN_NAME)
        {
            $returnType = 'void';
            $name = $token->text;
        }
        else
        {
            throw new \Exception('Unexpected '.$this->getTokenName().' expecting event name or int|void|float|pointer');
        }
        $this->obiekt->addEvent($name, null);
        $this->next(TOKEN_LEFT_PARENTHESIS);

        // TODO: $returnType and $args are collected but not attached to the
        // event yet (how addEvent() should receive them is not decided here).
        $args = [];
        $token = $this->next();
        while($token->type !== TOKEN_RIGHT_PARENTHESIS)
        {
            if($token->type !== TOKEN_NAME)
            {
                throw new \Exception('Unexpected '.$token->text.' expecting argument name or )');
            }
            $this->next(TOKEN_COLON);
            $args[$token->text] = $this->next(TOKEN_DATA_TYPE)->text;
            $token = $this->next();
            if($token->type === TOKEN_COMMA)
            {
                // A comma must be followed by another argument.
                $token = $this->next(TOKEN_NAME);
            }
        }

        $this->next(TOKEN_COLON);
        $this->next(TOKEN_LEFT_BRACE);
        $this->parseCommands();
        $this->next(TOKEN_RIGHT_BRACE); //TODO: finish here, commands not parsed
    }

    /**
     * Skips over a macro invocation: its name, then everything up to the
     * next "(" and the next ")". No value is produced yet.
     *
     * @return null
     * @throws \Exception when the input ends inside the macro
     */
    private function parseMacro()
    {
        $this->next(TOKEN_NAME);
        $this->skip(TOKEN_LEFT_PARENTHESIS);
        $this->skip(TOKEN_RIGHT_PARENTHESIS);
    }

    /**
     * Parses a sequence of commands (currently only argument-less calls).
     * Stops before the first token that does not start a command.
     *
     * @throws \Exception on an unexpected token
     */
    private function parseCommands()
    {
        //TODO: no parse tree is built yet
        while(true)
        {
            // Read a fresh token on every iteration; otherwise the loop would
            // keep dispatching on the first token forever.
            $token = $this->next();
            if($token->type !== TOKEN_NAME)
            {
                $this->prev();
                return;
            }
            $this->parseFunctionCall();
        }
    }

    /**
     * Parses the "()" of a call; the pointer is on the function name.
     *
     * @throws \Exception on an unexpected token
     */
    private function parseFunctionCall()
    {
        $this->next(TOKEN_LEFT_PARENTHESIS);
        $this->next(TOKEN_RIGHT_PARENTHESIS);
    }

    /**
     * Parse ELECTIONS code from file
     *
     * @param string $path path to source file
     * @return null see parse()
     * @throws \Exception when the file cannot be read or contains errors
     */
    public function parseFile($path)
    {
        $source = file_get_contents($path);
        if(false === $source)
        {
            throw new \Exception(sprintf('Cannot read file %s', $path));
        }
        return $this->parse($source);
    }
}
/**
 * Regex-driven lexer: splits source text into Token objects.
 *
 * Rules are tried in the order they were added and the first rule that
 * matches at the current position wins (not the longest match). Whitespace
 * between tokens is skipped.
 */
class Lexer
{
    /**
     * Token types keyed by the complete (delimited) regular expression.
     * @var array<string, int>
     */
    private $regexs = [];

    /**
     * Adds new token to recognize in source code
     *
     * The pattern is wrapped in "~" delimiters with the "i" (case-insensitive)
     * and "A" (anchored) modifiers, so it must not contain an unescaped "~".
     *
     * @param string $regex PCRE regular expression without delimiter
     * @param int $token token type (use constant)
     */
    public function add($regex, $token)
    {
        $this->regexs['~'.$regex.'~iA'] = $token;
    }

    /**
     * Divides input string into tokens
     *
     * Comments are dropped; string tokens are stored without their
     * surrounding quotes.
     *
     * @param string $input
     * @return Token[] found tokens
     * @throws \Exception if an input character matches no defined token
     */
    public function tokenize($input)
    {
        // Same character set that trim() strips by default.
        $whitespace = " \t\n\r\0\x0B";

        $input = (string)$input;
        $offset = 0;
        // Drop a UTF-8 byte order mark before skipping whitespace, so that
        // whitespace following the BOM is not mistaken for an invalid token.
        if(0 === strncmp($input, "\xEF\xBB\xBF", 3))
        {
            $offset = 3;
        }
        $length = strlen($input);
        $tokens = [];

        while(true)
        {
            $offset += strspn($input, $whitespace, $offset);
            if($offset >= $length)
            {
                break;
            }

            $match = false;
            foreach($this->regexs as $regex => $token)
            {
                // With the "A" modifier the pattern only matches at $offset.
                // An empty match is ignored: it would never consume input.
                if(1 !== preg_match($regex, $input, $matches, 0, $offset) || '' === $matches[0])
                {
                    continue;
                }
                $match = true;
                $offset += strlen($matches[0]);
                if($token === TOKEN_COMMENT)
                {
                    break;
                }
                if($token === TOKEN_STRING)
                {
                    // Strip the surrounding quotes.
                    $matches[0] = substr($matches[0], 1, -1);
                }
                $tokens[] = new Token($token, $matches[0]);
                break;
            }

            if(!$match)
            {
                // Report only the offending character (whole UTF-8 character
                // when possible, single byte otherwise).
                $character = 1 === preg_match('~.~usA', $input, $found, 0, $offset)
                    ? $found[0]
                    : $input[$offset];
                throw new \Exception(sprintf('Invalid character %s', $character));
            }
        }
        return $tokens;
    }
}
W przypadku wyrażeń z priorytetami operatorów i zagnieżdżeniami nie obejdzie się bez hierarchii. Oto przykład:
if(lokal_wyborczy.skrzynka.pelna == true)
{
if(1 > 0 + 0x0)
{
ObliczWynik( 50 + 2 * 2 ^ (2 + 2) + ObliczWynik(1) )
}
}
Czy wystarczy budować drzewa tylko dla wyrażeń, czy trzeba je budować dla całego kodu? W sieci nie ma zbyt wielu przykładów. Czy węzły drzewa powinny odpowiadać pojedynczym jednostkom leksykalnym (TokenIntNode, TokenPlusNode, TokenIfNode), czy operacjom (MultiplicationNode, AdditionNode, IfNode)?