Pelzini

This is the code documentation for the Pelzini project

source of /processor/javascript_lexer.php

Contains the JavascriptLexer class
  1. <?php
  2. /*
  3. Copyright 2008 Josh Heidenreich
  4.  
  5. This file is part of Pelzini.
  6.  
  7. Pelzini is free software: you can redistribute it and/or modify
  8. it under the terms of the GNU General Public License as published by
  9. the Free Software Foundation, either version 3 of the License, or
  10. (at your option) any later version.
  11.  
  12. Pelzini is distributed in the hope that it will be useful,
  13. but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15. GNU General Public License for more details.
  16.  
  17. You should have received a copy of the GNU General Public License
  18. along with Pelzini. If not, see <http://www.gnu.org/licenses/>.
  19. */
  20.  
  21.  
  22. /**
  23.  * Contains the {@link JavascriptLexer} class
  24.  *
  25.  * @package Parsers
  26.  * @author Josh
  27.  * @since 0.2
  28.  **/
  29.  
  /**
   * Tokenises a javascript file.
   *
   * The lexer walks the source string from left to right and emits Token
   * objects for brackets and punctuation, comments, string literals, numbers,
   * reserved words, reserved values and identifiers. Characters that do not
   * start any recognised token (e.g. whitespace and most operators) are skipped.
   *
   * The Token class and the TOKEN_* constants are declared outside this file.
   * NOTE(review): Token presumably records the current line number when it is
   * constructed, which is why line counters are advanced only after a token has
   * been created - confirm against the Token class.
   **/
  class JavascriptLexer
  {
      // Should this be common for all lexers?
      private $single_characters = array(
          '(' => TOKEN_OPEN_NORMAL_BRACKET,
          ')' => TOKEN_CLOSE_NORMAL_BRACKET,
          '{' => TOKEN_OPEN_CURLY_BRACKET,
          '}' => TOKEN_CLOSE_CURLY_BRACKET,
          '[' => TOKEN_OPEN_SQUARE_BRACKET,
          ']' => TOKEN_CLOSE_SQUARE_BRACKET,
          '=' => TOKEN_EQUALS,
          '.' => TOKEN_PERIOD,
          ',' => TOKEN_COMMA,
          ';' => TOKEN_SEMICOLON
      );

      // Reserved words, followed by the future reserved words
      private $reserved_words = array(
          'break', 'else', 'new', 'var', 'case', 'finally', 'return', 'void', 'catch',
          'for', 'switch', 'while', 'do', 'continue', 'function', 'this', 'with', 'default', 'if', 'throw',
          'delete', 'in', 'try', 'instanceof', 'typeof',

          'abstract', 'enum', 'int', 'short', 'boolean', 'export', 'interface', 'static', 'byte', 'extends',
          'long', 'super', 'char', 'final', 'native', 'synchronized', 'class', 'float', 'package', 'throws',
          'const', 'goto', 'private', 'transient', 'debugger', 'implements', 'protected', 'volatile'
      );

      private $reserved_values = array('null', 'true', 'false');


      /**
       * Resets any state variables used by this class back to their initial state
       *
       * This lexer keeps no state between calls, so there is nothing to reset.
       **/
      public function resetState()
      {
      }


      /**
       * Tokenises a string of javascript source code.
       *
       * @param string $source The javascript source to tokenise
       * @return array Zero or more Token objects, in source order
       **/
      public function process($source)
      {
          $offset = 0;
          $length = strlen($source);
          $tokens = array();
          $matches = array();

          Token::setCurrLineNum(1);
          while ($offset < $length) {

              // Line breaks. "\r\n" is tried first so that a Windows line ending
              // is counted as one line rather than two.
              if (preg_match('/\G(?:\r\n|\n|\r)/', $source, $matches, 0, $offset)) {
                  Token::setIncrLineNum();
                  $offset += strlen($matches[0]);
                  continue;
              }

              // Firstly, look for single character tokens
              // Should this be common for all lexers?
              $char = $source[$offset];
              if (isset($this->single_characters[$char])) {
                  $tokens[] = new Token($this->single_characters[$char], $char);
                  $offset++;
                  continue;
              }

              // Search for a block comment, either a docblock or a regular /* */ comment.
              // An unterminated comment is not treated as a comment at all.
              if (substr($source, $offset, 2) === '/*') {
                  $end = strpos($source, '*/', $offset + 2);
                  if ($end !== false) {
                      $text = substr($source, $offset, $end + 2 - $offset);

                      // "/**/" is an empty regular comment, not a docblock
                      $is_docblock = (strlen($text) > 4 && $text[2] === '*');

                      $tokens[] = new Token($is_docblock ? TOKEN_DOCBLOCK : TOKEN_COMMENT, $text);
                      $offset += strlen($text);
                      Token::setIncrLineNum($this->countLineBreaks($text));
                      continue;
                  }
              }

              // Search for a // comment. The line break itself is left for the
              // line-break branch above, so a comment on the last line of a file
              // without a trailing newline is still recognised.
              if (preg_match('/\G\/\/[^\r\n]*/', $source, $matches, 0, $offset)) {
                  $tokens[] = new Token(TOKEN_COMMENT, rtrim($matches[0]));
                  $offset += strlen($matches[0]);
                  continue;
              }

              // Search for a double-quoted or single-quoted string.
              // Backslash escapes (including escaped quotes and escaped line breaks)
              // are part of the string; a raw line break is not.
              if (preg_match('/\G(["\'])(?:(?!\1)[^\\\\\r\n]|\\\\(?:\r\n|.))*+\1/s', $source, $matches, 0, $offset)) {
                  $tokens[] = new Token(TOKEN_STRING, $matches[0]);
                  $offset += strlen($matches[0]);
                  Token::setIncrLineNum($this->countLineBreaks($matches[0]));
                  continue;
              }

              // Search for a number: hexadecimal, or decimal with optional fraction and exponent
              if (preg_match('/\G(?:0x[0-9a-f]+|[0-9]+(?:\.[0-9]+)?(?:e[+-]?[0-9]+)?)/i', $source, $matches, 0, $offset)) {
                  $tokens[] = new Token(TOKEN_NUMBER, $matches[0]);
                  $offset += strlen($matches[0]);
                  continue;
              }

              // Search for a word, then classify it. Matching the whole word first
              // means "index" is an identifier and not the reserved word "in"
              // followed by "dex". Javascript is case-sensitive, so is the lookup.
              if (preg_match('/\G[a-zA-Z$_][a-zA-Z0-9$_]*/', $source, $matches, 0, $offset)) {
                  $word = $matches[0];

                  // Some reserved words get a specific token - basically anything that is understood by the analyser
                  // everything else just gets the generic 'reserved word' token.
                  if ($word === 'function') {
                      $tokens[] = new Token(TOKEN_FUNCTION);
                  } else if (in_array($word, $this->reserved_words, true)) {
                      $tokens[] = new Token(TOKEN_RESERVED_WORD, $word);
                  } else if (in_array($word, $this->reserved_values, true)) {
                      $tokens[] = new Token(TOKEN_RESERVED_VALUE, $word);
                  } else {
                      $tokens[] = new Token(TOKEN_IDENTIFIER, $word);
                  }

                  $offset += strlen($word);
                  continue;
              }

              // Nothing recognised at this position; skip one character
              $offset++;
          }

          return $tokens;
      }


      /**
       * Counts the line breaks in a piece of text, treating "\r\n" as a single break
       *
       * @param string $text The text to examine
       * @return int The number of line breaks found
       **/
      private function countLineBreaks($text)
      {
          $junk = array();
          return (int) preg_match_all('/\r\n|\n|\r/', $text, $junk);
      }


  }
  200.  
  201.  
  202. ?>
  203.