forked from aramk/crayon-syntax-highlighter
-
Notifications
You must be signed in to change notification settings - Fork 0
/
crayon_parser.class.php
265 lines (241 loc) · 9.5 KB
/
crayon_parser.class.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
<?php
require_once ('global.php');
require_once (CRAYON_LANGS_PHP);
/**
 * Manages parsing the syntax for any given language, constructing the regex, and validating the
 * elements.
 *
 * Every member is static; the private constructor prevents instantiation.
 */
class CrayonParser {
    // Properties and Constants ===============================================

    // Mode names a language file may set (see $modes and parse()).
    const CASE_INSENSITIVE = 'CASE_INSENSITIVE';
    const MULTI_LINE = 'MULTI_LINE';
    const SINGLE_LINE = 'SINGLE_LINE';
    const ALLOW_MIXED = 'ALLOW_MIXED';
    //const NO_END_TAG = '(?![^<]*>)'; // No longer used

    // Reserved element matching <, >, HTML entities and runs of spaces/tabs.
    const HTML_CHAR = 'HTML_CHAR';
    const HTML_CHAR_REGEX = '<|>|(&([\w-]+);?)|[ \t]+';

    // Reserved element matching {{crayon-internal:...}} placeholders.
    const CRAYON_ELEMENT = 'CRAYON_ELEMENT';
    const CRAYON_ELEMENT_REGEX = '\{\{crayon-internal:[^\}]*\}\}';
    const CRAYON_ELEMENT_REGEX_CAPTURE = '\{\{crayon-internal:([^\}]*)\}\}';

    // Set of recognised mode names, keyed by name.
    private static $modes = array(self::CASE_INSENSITIVE => TRUE, self::MULTI_LINE => TRUE, self::SINGLE_LINE => TRUE, self::ALLOW_MIXED => TRUE);

    // Methods ================================================================

    private function __construct() {}

    /**
     * Parse all languages stored in CrayonLangs.
     * Avoid using this unless you must list the details in language files for all languages.
     *
     * @return array|bool The languages returned by CrayonResources::langs()->get(), or FALSE
     *                    when that result is empty.
     */
    public static function parse_all() {
        $langs = CrayonResources::langs()->get();
        if (empty($langs)) {
            return FALSE;
        }
        foreach ($langs as $lang) {
            self::parse($lang->id());
        }
        return $langs;
    }

    /**
     * Read a syntax file and parse the regex rules within it; this may require several other
     * files containing lists of keywords and such to be read. Updates the parsed elements and
     * regex in the CrayonLang with the given $id.
     *
     * @param mixed $id Identifier of the language to parse.
     * @return bool|null FALSE when the language is not loaded or its file cannot be read;
     *                   NULL otherwise (including when the language was already parsed).
     */
    public static function parse($id) {
        // Verify the language is loaded and has not been parsed before
        if (!($lang = CrayonResources::langs()->get($id))) {
            CrayonLog::syslog("The language with id '$id' was not loaded and could not be parsed.");
            return FALSE;
        } else if ($lang->is_parsed()) {
            return;
        }

        // Read language file. The result is used as a regex subject below, so the 'wcs'
        // options are assumed to yield a single string - TODO confirm against CrayonUtil::lines().
        $path = CrayonResources::langs()->path($id);
        CrayonLog::debug('Parsing language ' . $path);
        if (($file = CrayonUtil::lines($path, 'wcs')) === FALSE) {
            CrayonLog::debug('Parsing failed ' . $path);
            return FALSE;
        }

        // Extract the language name, then strip the directive so it is not read as an element
        $name_pattern = '#^[ \t]*name[ \t]+([^\r\n]+)[ \t]*#mi';
        preg_match($name_pattern, $file, $name_match);
        if (count($name_match) > 1) {
            $lang->name($name_match[1]);
            $file = preg_replace($name_pattern, '', $file);
        }

        // Extract the language version, then strip the directive
        $version_pattern = '#^[ \t]*version[ \t]+([^\r\n]+)[ \t]*#mi';
        preg_match($version_pattern, $file, $version_match);
        if (count($version_match) > 1) {
            $lang->version($version_match[1]);
            $file = preg_replace($version_pattern, '', $file);
        }

        // Extract the modes, then strip the directives
        $mode_pattern = '#^[ \t]*(' . implode('|', array_keys(self::$modes)) . ')[ \t]+(?:=[ \t]*)?([^\r\n]+)[ \t]*#mi';
        preg_match_all($mode_pattern, $file, $mode_matches);
        if (count($mode_matches) == 3) {
            foreach ($mode_matches[1] as $i => $mode_name) {
                $lang->mode($mode_name, $mode_matches[2][$i]);
            }
            $file = preg_replace($mode_pattern, '', $file);
        }

        /* Add reserved Crayon element. This is used by Crayon internally. */
        $crayon_element = new CrayonElement(self::CRAYON_ELEMENT, $path, self::CRAYON_ELEMENT_REGEX);
        $lang->element(self::CRAYON_ELEMENT, $crayon_element);

        // Extract elements, classes and regex
        $pattern = '#^[ \t]*([\w:]+)[ \t]+(?:\[([\w\t ]*)\][ \t]+)?([^\r\n]+)[ \t]*#m';
        preg_match_all($pattern, $file, $matches);
        // Counted once up front; also guards against a missing match group
        $total = empty($matches[0]) ? 0 : count($matches[0]);
        if ($total === 0) {
            CrayonLog::syslog("No regex patterns and/or elements were parsed from language file at '$path'.");
        }

        // Remember state in case we encounter catchable exceptions
        $error = FALSE;
        for ($i = 0; $i < $total; $i++) {
            // Plain copies (not references) so nothing stays bound to the match arrays
            $name = trim(strtoupper($matches[1][$i]));
            $class = $matches[2][$i];
            $regex = $matches[3][$i];

            // Ensure both the element and regex are valid
            if (empty($name) || empty($regex)) {
                CrayonLog::syslog("Element(s) and/or regex(es) are missing in '$path'.");
                $error = TRUE;
                continue;
            }

            // Look for fallback element, written as NAME:FALLBACK
            $pieces = explode(':', $name);
            if (count($pieces) == 2) {
                $name = $pieces[0];
                $fallback = $pieces[1];
            } else if (count($pieces) == 1) {
                $name = $pieces[0];
                $fallback = '';
            } else {
                CrayonLog::syslog("Too many colons found in element name '$name' in '$path'");
                $error = TRUE;
                continue;
            }

            // Create a new CrayonElement
            $element = new CrayonElement($name, $path);
            $element->fallback($fallback);
            if (!empty($class)) {
                // Avoid setting known css to blank
                $element->css($class);
            }
            if ($element->regex($regex) === FALSE) {
                $error = TRUE;
                continue;
            }

            // Add the regex to the element
            $lang->element($name, $element);
        }

        // Record the outcome once every element has been processed, so that a failure in a
        // later element (or in all of them) is reflected in the final state.
        if ($total > 0) {
            $lang->state($error ? CrayonLang::PARSED_ERRORS : CrayonLang::PARSED_SUCCESS);
        }

        /* Prevents < > and other html entities from being printed as is, which could lead to actual html tags
         * from the printed code appearing on the page - not good. This can also act to color any HTML entities
         * that are not picked up by previously defined elements.
         */
        $html = new CrayonElement(self::HTML_CHAR, $path, self::HTML_CHAR_REGEX);
        $lang->element(self::HTML_CHAR, $html);
    }

    /**
     * Validates a regex and expands the special tags it may contain, using data stored in a
     * CrayonElement.
     *
     * Handles, in order: (?alt:file), (?default) / (?default:element) and (?html:text), then
     * escapes the result and test-compiles it with '#' as the delimiter.
     *
     * @param mixed $regex   Raw regex taken from a language file.
     * @param mixed $element Element the regex belongs to; must be an object whose class name
     *                       equals CRAYON_ELEMENT_CLASS.
     * @return string|bool The processed regex; '' when the arguments are not a string and a
     *                     matching element object; FALSE when a tag cannot be resolved or the
     *                     final regex does not compile.
     */
    public static function validate_regex($regex, $element) {
        // is_object() must come first: get_class() throws a TypeError for non-objects on
        // PHP 8, and the @ operator cannot suppress that.
        if (!is_string($regex) || !is_object($element) || get_class($element) != CRAYON_ELEMENT_CLASS) {
            return '';
        }

        // If the (?alt) tag has been used, insert the file into the regex
        $file = self::regex_match('#\(\?alt:(.+?)\)#', $regex);
        if (count($file) == 2) {
            // Element 0 has full match, 1 has captured groups
            $alt_count = count($file[1]);
            for ($i = 0; $i < $alt_count; $i++) {
                // The referenced file is resolved relative to the element's own file
                $file_lines = CrayonUtil::lines(dirname($element->path()) . crayon_s() . $file[1][$i], 'rcwh');
                if ($file_lines === FALSE) {
                    CrayonLog::syslog("Parsing of '{$element->path()}' failed, an (?alt) tag failed for the element '{$element->name()}'" );
                    return FALSE;
                }
                $file_lines = implode('|', $file_lines);
                // If any spaces exist, treat them as whitespace
                $file_lines = preg_replace('#[ \t]+#msi', '\s+', $file_lines);
                $regex = str_replace($file[0][$i], "(?:$file_lines)", $regex);
            }
        }

        // If the (?default:element) function is used, replace the regex with the default, if exists
        $def = self::regex_match('#\(\?default(?:\:(\w+))?\)#', $regex);
        if (count($def) == 2) {
            // Load default language
            $default = CrayonResources::langs()->get(CrayonLangs::DEFAULT_LANG);
            // If default has not been loaded, we can't use it, skip the element
            if (!$default) {
                CrayonLog::syslog(
                    "Could not use default regex in the element '{$element->name()}' in '{$element->path()}'");
                return FALSE;
            }
            $def_count = count($def[1]);
            for ($i = 0; $i < $def_count; $i++) {
                // If an element has been provided use it, otherwise use this element's own name
                $element_name = (!empty($def[1][$i])) ? $def[1][$i] : $element->name();
                if (($default_element = $default->element($element_name)) != FALSE) {
                    $regex = str_replace($def[0][$i], '(?:' . $default_element->regex() .')', $regex);
                } else {
                    CrayonLog::syslog("The language at '{$element->path()}' referred to the Default Language regex for element '{$element->name()}', which did not exist.");
                    if (CRAYON_DEBUG) {
                        CrayonLog::syslog("Default language URL: " . CrayonResources::langs()->url(CrayonLangs::DEFAULT_LANG));
                        CrayonLog::syslog("Default language Path: " . CrayonResources::langs()->path(CrayonLangs::DEFAULT_LANG));
                    }
                    return FALSE;
                }
            }
        }

        // If the (?html) tag is used, escape characters in html (<, > and &)
        $html = self::regex_match('#\(\?html:(.+?)\)#', $regex);
        if (count($html) == 2) {
            foreach ($html[1] as $i => $html_text) {
                $regex = str_replace($html[0][$i], htmlentities($html_text), $regex);
            }
        }

        // Ensure all parenthesis are atomic to avoid conflicting with element matches
        $regex = CrayonUtil::esc_atomic($regex);
        // Escape #, this is our delimiter
        $regex = CrayonUtil::esc_hash($regex);

        // Test if regex is valid; @ hides the compile warning, the FALSE result is what we check
        if (@preg_match("#$regex#", '') === FALSE) {
            CrayonLog::syslog("The regex for the element '{$element->name()}' in '{$element->path()}' is not valid.");
            return FALSE;
        }
        return $regex;
    }

    /**
     * Normalises a list of CSS class names: dots are removed, the text is lowercased and
     * the space-separated names are re-joined with single spaces.
     *
     * @param mixed $css Space-separated CSS class names.
     * @return string The cleaned class list, or '' when $css is not a string.
     */
    public static function validate_css($css) {
        if (!is_string($css)) {
            return '';
        }
        // Remove dots in CSS class and convert to lowercase
        $css = strtolower(str_replace('.', '', $css));
        // array_filter() without a callback drops the same values empty() rejects
        return implode(' ', array_filter(explode(' ', $css)));
    }

    /**
     * Runs preg_match_all() and returns its match groups.
     *
     * @param string $pattern Delimited regex.
     * @param string $subject Text to search.
     * @return array Match groups in pattern order, or an empty array when nothing matched.
     */
    public static function regex_match($pattern, $subject) {
        if (preg_match_all($pattern, $subject, $matches)) {
            return $matches;
        }
        return array();
    }

    /**
     * @return array The recognised mode names as keys, each mapped to TRUE.
     */
    public static function modes() {
        return self::$modes;
    }

    /**
     * @param mixed $name Candidate mode name.
     * @return bool TRUE when $name is a string naming one of the recognised modes.
     */
    public static function is_mode($name) {
        return is_string($name) && array_key_exists($name, self::$modes);
    }
}
?>