crayon-syntax-highlighter/crayon_parser.class.php

276 lines
11 KiB
PHP

<?php
require_once('global.php');
require_once(CRAYON_LANGS_PHP);
/**
 * Manages parsing the syntax for any given language, constructing the regex, and validating the
 * elements.
 *
 * Everything is exposed through static methods; the constructor is private, so the class is not
 * meant to be instantiated.
 */
class CrayonParser
{
    // Properties and Constants ===============================================

    // Names of the modes that a language file may declare (see self::$modes)
    const CASE_INSENSITIVE = 'CASE_INSENSITIVE';
    const MULTI_LINE = 'MULTI_LINE';
    const SINGLE_LINE = 'SINGLE_LINE';
    const ALLOW_MIXED = 'ALLOW_MIXED';
    //const NO_END_TAG = '(?![^<]*>)'; // No longer used

    // Reserved element appended to every language after its own elements (see parse())
    const HTML_CHAR = 'HTML_CHAR';
    const HTML_CHAR_REGEX = '<|>|(&([\w-]+);?)|[ \t]+';

    // Reserved element registered before a language's own elements; used by Crayon internally
    const CRAYON_ELEMENT = 'CRAYON_ELEMENT';
    const CRAYON_ELEMENT_REGEX = '\{\{crayon-internal:[^\}]*\}\}';
    const CRAYON_ELEMENT_REGEX_CAPTURE = '\{\{crayon-internal:([^\}]*)\}\}';

    // Lookup table of recognised mode names; only the keys are consulted in this class
    private static $modes = array(self::CASE_INSENSITIVE => TRUE, self::MULTI_LINE => TRUE, self::SINGLE_LINE => TRUE, self::ALLOW_MIXED => TRUE);

    // Methods ================================================================

    private function __construct()
    {
    }

    /**
     * Parse all languages stored in CrayonLangs.
     * Avoid using this unless you must list the details in language files for all languages.
     *
     * @return mixed The collection returned by CrayonResources::langs()->get() once every entry
     *               has been passed to parse(), or FALSE when that collection is empty.
     */
    public static function parse_all()
    {
        $langs = CrayonResources::langs()->get();
        if (empty($langs)) {
            return FALSE;
        }
        foreach ($langs as $lang) {
            self::parse($lang->id());
        }
        return $langs;
    }

    /**
     * Read a syntax file and parse the regex rules within it, this may require several other
     * files containing lists of keywords and such to be read. Updates the parsed elements and
     * regex in the CrayonLang with the given $id.
     *
     * @param mixed $id Identifier of the language to parse, as understood by CrayonResources::langs().
     * @return bool|null FALSE when the language is not loaded or its file cannot be read;
     *                   NULL otherwise (including when the language was already parsed).
     */
    public static function parse($id)
    {
        // Verify the language is loaded and has not been parsed before
        if (!($lang = CrayonResources::langs()->get($id))) {
            CrayonLog::syslog("The language with id '$id' was not loaded and could not be parsed.");
            return FALSE;
        } else if ($lang->is_parsed()) {
            return;
        }

        // Read language file
        // NOTE(review): the contents are used as a single string below, so the 'wcs' flags
        // presumably make CrayonUtil::lines() return a string - confirm against CrayonUtil.
        $path = CrayonResources::langs()->path($id);
        CrayonLog::debug('Parsing language ' . $path);
        if (($file = CrayonUtil::lines($path, 'wcs')) === FALSE) {
            CrayonLog::debug('Parsing failed ' . $path);
            return FALSE;
        }

        // Extract the language name and strip the directive from the remaining text
        $name_pattern = '#^[ \t]*name[ \t]+([^\r\n]+)[ \t]*#mi';
        preg_match($name_pattern, $file, $name_match);
        if (count($name_match) > 1) {
            $lang->name($name_match[1]);
            $file = preg_replace($name_pattern, '', $file);
        }

        // Extract the language version and strip the directive from the remaining text
        $version_pattern = '#^[ \t]*version[ \t]+([^\r\n]+)[ \t]*#mi';
        preg_match($version_pattern, $file, $version_match);
        if (count($version_match) > 1) {
            $lang->version($version_match[1]);
            $file = preg_replace($version_pattern, '', $file);
        }

        // Extract the modes
        $mode_pattern = '#^[ \t]*(' . implode('|', array_keys(self::$modes)) . ')[ \t]+(?:=[ \t]*)?([^\r\n]+)[ \t]*#mi';
        preg_match_all($mode_pattern, $file, $mode_matches);
        if (count($mode_matches) == 3) {
            $mode_total = count($mode_matches[0]);
            for ($i = 0; $i < $mode_total; $i++) {
                $lang->mode($mode_matches[1][$i], $mode_matches[2][$i]);
            }
            $file = preg_replace($mode_pattern, '', $file);
        }

        /* Add reserved Crayon element. This is used by Crayon internally. */
        $crayon_element = new CrayonElement(self::CRAYON_ELEMENT, $path, self::CRAYON_ELEMENT_REGEX);
        $lang->element(self::CRAYON_ELEMENT, $crayon_element);

        // Extract elements, classes and regex.
        // Each remaining line has the form: NAME[:FALLBACK] [optional css classes] regex
        $pattern = '#^[ \t]*([\w:]+)[ \t]+(?:\[([\w\t ]*)\][ \t]+)?([^\r\n]+)[ \t]*#m';
        preg_match_all($pattern, $file, $matches);
        $total = empty($matches[0]) ? 0 : count($matches[0]);
        if ($total == 0) {
            CrayonLog::syslog("No regex patterns and/or elements were parsed from language file at '$path'.");
        }

        // Remember whether any element had to be skipped
        $error = FALSE;
        for ($i = 0; $i < $total; $i++) {
            // Plain copies are enough; nothing reads the match arrays after this loop
            $name = trim(strtoupper($matches[1][$i]));
            $class = $matches[2][$i];
            $regex = $matches[3][$i];

            // Ensure both the element and regex are valid
            if (empty($name) || empty($regex)) {
                CrayonLog::syslog("Element(s) and/or regex(es) are missing in '$path'.");
                $error = TRUE;
                continue;
            }

            // Look for fallback element (NAME:FALLBACK)
            $pieces = explode(':', $name);
            $piece_total = count($pieces);
            if ($piece_total == 2) {
                $name = $pieces[0];
                $fallback = $pieces[1];
            } else if ($piece_total == 1) {
                $name = $pieces[0];
                $fallback = '';
            } else {
                CrayonLog::syslog("Too many colons found in element name '$name' in '$path'");
                $error = TRUE;
                continue;
            }

            // Create a new CrayonElement
            $element = new CrayonElement($name, $path);
            $element->fallback($fallback);
            if (!empty($class)) {
                // Avoid setting known css to blank
                $element->css($class);
            }
            if ($element->regex($regex) === FALSE) {
                $error = TRUE;
                continue;
            }

            // Add the element, now carrying its regex, to the language
            $lang->element($name, $element);
        }

        // Record the outcome once every element line has been processed, so that failures on the
        // last line(s) - or on every line - are reflected in the state as well.
        if ($total > 0) {
            $lang->state($error ? CrayonLang::PARSED_ERRORS : CrayonLang::PARSED_SUCCESS);
        }

        /* Prevents < > and other html entities from being printed as is, which could lead to actual html tags
         * from the printed code appearing on the page - not good. This can also act to color any HTML entities
         * that are not picked up by previously defined elements.
         */
        $html = new CrayonElement(self::HTML_CHAR, $path, self::HTML_CHAR_REGEX);
        $lang->element(self::HTML_CHAR, $html);
    }

    /**
     * Validates regex and accesses data stored in a CrayonElement.
     *
     * Expands the (?alt:file), (?default[:element]) and (?html:text) tags found in $regex, runs
     * the result through CrayonUtil::esc_atomic() and CrayonUtil::esc_hash(), and finally checks
     * that it compiles when delimited with '#'.
     *
     * @param mixed $regex   The raw regex taken from a language file.
     * @param mixed $element The element the regex belongs to; its path() and name() are read.
     * @return string|bool The processed regex; FALSE if a tag could not be expanded or the regex
     *                     does not compile; '' if $regex is not a string or $element is not an
     *                     object of class CRAYON_ELEMENT_CLASS.
     */
    public static function validate_regex($regex, $element)
    {
        // is_object() comes first: get_class() throws a TypeError for non-objects on PHP 8,
        // and the @ operator cannot suppress exceptions.
        if (!is_string($regex) || !is_object($element) || get_class($element) != CRAYON_ELEMENT_CLASS) {
            return '';
        }

        // If the (?alt) tag has been used, insert the file into the regex
        $file = self::regex_match('#\(\?alt:(.+?)\)#', $regex);
        if (count($file) == 2) {
            // Element 0 has full match, 1 has captured groups
            $alt_total = count($file[1]);
            for ($i = 0; $i < $alt_total; $i++) {
                // The referenced file is looked up next to the language file
                $file_lines = CrayonUtil::lines(dirname($element->path()) . crayon_s() . $file[1][$i], 'rcwh');
                if ($file_lines === FALSE) {
                    CrayonLog::syslog("Parsing of '{$element->path()}' failed, an (?alt) tag failed for the element '{$element->name()}'");
                    return FALSE;
                }
                $file_lines = implode('|', $file_lines);
                // If any spaces exist, treat them as whitespace
                $file_lines = preg_replace('#[ \t]+#msi', '\s+', $file_lines);
                $regex = str_replace($file[0][$i], "(?:$file_lines)", $regex);
            }
        }

        // If the (?default:element) function is used, replace the regex with the default, if exists
        $def = self::regex_match('#\(\?default(?:\:(\w+))?\)#', $regex);
        if (count($def) == 2) {
            // Load default language
            $default = CrayonResources::langs()->get(CrayonLangs::DEFAULT_LANG);
            // If default has not been loaded, we can't use it, skip the element
            if (!$default) {
                CrayonLog::syslog(
                    "Could not use default regex in the element '{$element->name()}' in '{$element->path()}'");
                return FALSE;
            }
            $def_total = count($def[1]);
            for ($i = 0; $i < $def_total; $i++) {
                // Use the element named in the tag, or this element's own name when none is given
                $element_name = (!empty($def[1][$i])) ? $def[1][$i] : $element->name();
                if (($default_element = $default->element($element_name)) != FALSE) {
                    $regex = str_replace($def[0][$i], '(?:' . $default_element->regex() . ')', $regex);
                } else {
                    CrayonLog::syslog("The language at '{$element->path()}' referred to the Default Language regex for element '{$element->name()}', which did not exist.");
                    if (CRAYON_DEBUG) {
                        CrayonLog::syslog("Default language URL: " . CrayonResources::langs()->url(CrayonLangs::DEFAULT_LANG));
                        CrayonLog::syslog("Default language Path: " . CrayonResources::langs()->path(CrayonLangs::DEFAULT_LANG));
                    }
                    return FALSE;
                }
            }
        }

        // If the (?html) tag is used, escape characters in html (<, > and &)
        $html = self::regex_match('#\(\?html:(.+?)\)#', $regex);
        if (count($html) == 2) {
            $html_total = count($html[1]);
            for ($i = 0; $i < $html_total; $i++) {
                $regex = str_replace($html[0][$i], htmlentities($html[1][$i]), $regex);
            }
        }

        // Ensure all parenthesis are atomic to avoid conflicting with element matches
        $regex = CrayonUtil::esc_atomic($regex);
        // Escape #, this is our delimiter
        $regex = CrayonUtil::esc_hash($regex);

        // Test if regex is valid; preg_match() returns FALSE when the pattern fails to compile
        if (@preg_match("#$regex#", '') === FALSE) {
            CrayonLog::syslog("The regex for the element '{$element->name()}' in '{$element->path()}' is not valid.");
            return FALSE;
        }
        return $regex;
    }

    /**
     * Normalises a space separated list of CSS classes: dots are removed, the text is lowercased
     * and empty entries are dropped.
     *
     * @param mixed $css The raw class list.
     * @return string The cleaned, single-space separated class list, or '' if $css is not a string.
     */
    public static function validate_css($css)
    {
        if (!is_string($css)) {
            return '';
        }
        // Remove dots in CSS class and convert to lowercase
        $classes = explode(' ', strtolower(str_replace('.', '', $css)));
        // array_filter() without a callback discards exactly the values empty() would reject
        return trim(implode(' ', array_filter($classes)));
    }

    /**
     * Runs preg_match_all() and returns its matches.
     *
     * @param string $pattern The delimited regex.
     * @param string $subject The text to search.
     * @return array The preg_match_all() matches (index 0 holds full matches, the following
     *               indices hold the capture groups), or an empty array if nothing matched.
     */
    public static function regex_match($pattern, $subject)
    {
        if (preg_match_all($pattern, $subject, $matches)) {
            return $matches;
        }
        return array();
    }

    /**
     * @return array The recognised modes, keyed by mode name.
     */
    public static function modes()
    {
        return self::$modes;
    }

    /**
     * @param mixed $name The candidate mode name.
     * @return bool TRUE if $name is a string naming one of the recognised modes.
     */
    public static function is_mode($name)
    {
        return is_string($name) && array_key_exists($name, self::$modes);
    }
}