Pelzini

This is the code documentation for the Pelzini project

source of /processor/c_lexer.php

Contains the CLexer class
  1. <?php
  2. /*
  3. Copyright 2008 Josh Heidenreich
  4.  
  5. This file is part of Pelzini.
  6.  
  7. Pelzini is free software: you can redistribute it and/or modify
  8. it under the terms of the GNU General Public License as published by
  9. the Free Software Foundation, either version 3 of the License, or
  10. (at your option) any later version.
  11.  
  12. Pelzini is distributed in the hope that it will be useful,
  13. but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15. GNU General Public License for more details.
  16.  
  17. You should have received a copy of the GNU General Public License
  18. along with Pelzini. If not, see <http://www.gnu.org/licenses/>.
  19. */
  20.  
  21.  
  22. /**
  23.  * Contains the {@link CLexer} class
  24.  *
  25.  * @package Parsers
  26.  * @author Josh
  27.  * @since 0.2
  28.  **/
  29.  
  /**
   * Tokenises a C file.
   *
   * The lexer walks the source from start to end and, at each position, tries a
   * fixed sequence of matchers (single characters, preprocessor lines, comments,
   * strings, numbers, words). The first matcher that succeeds produces exactly one
   * Token and the scan resumes immediately after the matched text. Characters that
   * no matcher recognises (whitespace, unsupported operators) are skipped.
   **/
  class CLexer
  {
      /**
       * Characters that always form a token on their own, mapped to the token type.
       * Should this be common for all lexers?
       **/
      private $single_characters = array(
          '(' => TOKEN_OPEN_NORMAL_BRACKET,
          ')' => TOKEN_CLOSE_NORMAL_BRACKET,
          '{' => TOKEN_OPEN_CURLY_BRACKET,
          '}' => TOKEN_CLOSE_CURLY_BRACKET,
          '[' => TOKEN_OPEN_SQUARE_BRACKET,
          ']' => TOKEN_CLOSE_SQUARE_BRACKET,
          '=' => TOKEN_EQUALS,
          '.' => TOKEN_PERIOD,
          ',' => TOKEN_COMMA,
          ';' => TOKEN_SEMICOLON,
          '*' => TOKEN_ASTERIX
      );

      /**
       * Words that are emitted as TOKEN_RESERVED_WORD tokens.
       **/
      private $reserved_words = array(
          'auto', 'break', 'case', 'continue', 'default', 'do', 'else', 'enum', 'extern',
          'for', 'goto', 'if', 'inline', 'register', 'restrict', 'return', 'sizeof', 'static',
          'struct', 'switch', 'typedef', 'union', 'volatile', 'while'
      );

      /**
       * Reserved words that have a dedicated token type of their own.
       **/
      private $token_words = array(
          'const' => TOKEN_CONST,
      );

      /**
       * Words that are emitted as TOKEN_RESERVED_VALUE tokens.
       **/
      private $reserved_values = array('NULL');


      /**
       * Resets any state variables used by this class back to their initial state.
       *
       * This lexer keeps no state between calls to process(), so there is
       * nothing to reset.
       **/
      public function resetState()
      {
      }


      /**
       * Tokenises the given C source code.
       *
       * Non-docblock comments are removed up front, so only docblocks and
       * double-slash comments normally reach the tokeniser as comment tokens.
       *
       * @param string $source The C source code to tokenise
       * @return array Zero or more Token objects, in source order
       **/
      public function process($source)
      {
          $source = (string) $source;
          $tokens = array();

          // Strip regular comments, leaving docblocks in place. preg_replace()
          // returns NULL on failure (e.g. backtrack limit); in that case the
          // source is tokenised unstripped rather than being discarded.
          $stripped = preg_replace('!/\*[^*].*?\*/!s', '', $source);
          if ($stripped !== null) {
              $source = $stripped;
          }

          // The length has to be measured after stripping, otherwise the loop
          // would run past the end of the (now shorter) string.
          $length = strlen($source);
          $offset = 0;

          while ($offset < $length) {
              // Whitespace (including newlines) never forms a token; skip a whole run at once
              if (preg_match('/\G\s+/', $source, $matches, 0, $offset)) {
                  $offset += strlen($matches[0]);
                  continue;
              }

              // Firstly, look for single character tokens
              $char = $source[$offset];
              if (isset($this->single_characters[$char])) {
                  $tokens[] = new Token($this->single_characters[$char], $char);
                  $offset++;
                  continue;
              }

              // Now use regular expressions to find various other tokens.
              // Every pattern is anchored at the current offset with \G, so the
              // length of the whole match is the distance to advance.

              // Search for a preprocessor directive. It runs to the end of the line,
              // or to the end of the input if the last line has no newline.
              if (preg_match('/\G#[a-z]+.*?(?:\n|\z)/s', $source, $matches, 0, $offset)) {
                  $tokens[] = new Token(TOKEN_C_PREPROCESSOR, $matches[0]);
                  $offset += strlen($matches[0]);
                  continue;
              }

              // Search for a Docblock comment
              if (preg_match('/\G\/\*\*(.+?)\*\//s', $source, $matches, 0, $offset)) {
                  $tokens[] = new Token(TOKEN_DOCBLOCK, $matches[0]);
                  $offset += strlen($matches[0]);
                  continue;
              }

              // Search for a regular /* */ comment
              if (preg_match('/\G\/\*(.+?)\*\//s', $source, $matches, 0, $offset)) {
                  $tokens[] = new Token(TOKEN_COMMENT, $matches[0]);
                  $offset += strlen($matches[0]);
                  continue;
              }

              // Search for a // comment, which may also be terminated by the end of the input
              if (preg_match('/\G\/\/[^\n]*(?:\n|\z)/', $source, $matches, 0, $offset)) {
                  $tokens[] = new Token(TOKEN_COMMENT, rtrim($matches[0]));
                  $offset += strlen($matches[0]);
                  continue;
              }

              // Search for a double-quoted string. A backslash escapes the character
              // after it, so an escaped quote does not terminate the string.
              if (preg_match('/\G"[^"\\\\]*(?:\\\\.[^"\\\\]*)*"/s', $source, $matches, 0, $offset)) {
                  $tokens[] = new Token(TOKEN_STRING, $matches[0]);
                  $offset += strlen($matches[0]);
                  continue;
              }

              // Search for a single-quoted string (character literal), with the same escaping rule
              if (preg_match('/\G\'[^\'\\\\]*(?:\\\\.[^\'\\\\]*)*\'/s', $source, $matches, 0, $offset)) {
                  $tokens[] = new Token(TOKEN_STRING, $matches[0]);
                  $offset += strlen($matches[0]);
                  continue;
              }

              // Search for a number: hexadecimal is tried before plain decimal
              if (preg_match('/\G(?:0x[0-9a-f]+|[0-9]+)/i', $source, $matches, 0, $offset)) {
                  $tokens[] = new Token(TOKEN_NUMBER, $matches[0]);
                  $offset += strlen($matches[0]);
                  continue;
              }

              // Search for a word. The whole word is matched first and classified
              // afterwards, so that e.g. "double" is an identifier rather than the
              // reserved word "do" followed by "uble".
              if (preg_match('/\G[a-z$_][a-z0-9$_]*/i', $source, $matches, 0, $offset)) {
                  $tokens[] = $this->createWordToken($matches[0]);
                  $offset += strlen($matches[0]);
                  continue;
              }

              // Nothing recognised at this position; skip the character
              $offset++;
          }

          return $tokens;
      }


      /**
       * Creates the token for a complete word: a reserved word, a reserved value,
       * a token word, or (if none of those) an identifier.
       *
       * The comparison is exact and case-sensitive, as C keywords are case-sensitive.
       *
       * @param string $word The complete word, as found in the source
       * @return Token The token representing the word
       **/
      private function createWordToken($word)
      {
          if (in_array($word, $this->reserved_words, true)) {
              return new Token(TOKEN_RESERVED_WORD, $word);
          }

          if (in_array($word, $this->reserved_values, true)) {
              return new Token(TOKEN_RESERVED_VALUE, $word);
          }

          if (isset($this->token_words[$word])) {
              return new Token($this->token_words[$word], $word);
          }

          return new Token(TOKEN_IDENTIFIER, $word);
      }
  }
  217.  
  218.  
  219. ?>
  220.