2019-08-30 19:30:19 +02:00
< ? php
2021-11-29 01:25:11 +01:00
require_once ( 'global.php' );
require_once ( CRAYON_LANGS_PHP );
2019-08-30 19:30:19 +02:00
/* Manages parsing the syntax for any given language, constructing the regex, and validating the
elements. */
2021-11-29 01:25:11 +01:00
class CrayonParser
{
// Properties and Constants ===============================================
// Names of the modes a language file may set. They are the keys of self::$modes below.
const CASE_INSENSITIVE = 'CASE_INSENSITIVE' ;
const MULTI_LINE = 'MULTI_LINE' ;
const SINGLE_LINE = 'SINGLE_LINE' ;
const ALLOW_MIXED = 'ALLOW_MIXED' ;
//const NO_END_TAG = '(?![^<]*>)'; // No longer used
// Name and regex of the element that parse() registers after a language's own elements.
// The regex matches '<', '>', '&name' entities (trailing ';' optional) and runs of spaces/tabs.
const HTML_CHAR = 'HTML_CHAR' ;
const HTML_CHAR_REGEX = '<|>|(&([\w-]+);?)|[ \t]+' ;
// Name and regex of the reserved element that parse() registers for Crayon's internal
// {{crayon-internal:...}} placeholders.
const CRAYON_ELEMENT = 'CRAYON_ELEMENT' ;
const CRAYON_ELEMENT_REGEX = '\{\{crayon-internal:[^\}]*\}\}' ;
// Same pattern as CRAYON_ELEMENT_REGEX, but with a capture group around the text that
// follows "crayon-internal:".
const CRAYON_ELEMENT_REGEX_CAPTURE = '\{\{crayon-internal:([^\}]*)\}\}' ;
2019-08-30 19:30:19 +02:00
2021-11-29 01:25:11 +01:00
// Lookup table of the recognised mode names (every value is TRUE). Its keys are used to build
// the mode pattern in parse() and to answer is_mode(); modes() returns the table itself.
private static $modes = array ( self :: CASE_INSENSITIVE => TRUE , self :: MULTI_LINE => TRUE , self :: SINGLE_LINE => TRUE , self :: ALLOW_MIXED => TRUE );
2019-08-30 19:30:19 +02:00
2021-11-29 01:25:11 +01:00
// Methods ================================================================
/**
 * Private and intentionally empty: every method of this class is static, and keeping the
 * constructor private stops code outside the class from instantiating it.
 */
private function __construct() {}
2019-08-30 19:30:19 +02:00
2021-11-29 01:25:11 +01:00
/**
 * Runs parse() on every language returned by CrayonResources::langs()->get().
 * Only use this when the details stored in the language files are needed for all
 * languages at once.
 *
 * @return array|bool The languages that were handed to parse(), or FALSE when the
 *         collection is empty.
 */
public static function parse_all()
{
    $all_langs = CrayonResources::langs()->get();

    if (!empty($all_langs)) {
        // parse() is called for its effect on each language; its return value is ignored.
        foreach ($all_langs as $current) {
            self::parse($current->id());
        }
        return $all_langs;
    }

    return FALSE;
}
/**
 * Reads the syntax file of the language with the given $id and parses the rules in it.
 *
 * The optional "name" and "version" lines and any mode lines are extracted first and removed
 * from the file contents; every remaining "ELEMENT[:FALLBACK] [css classes] regex" line is
 * turned into a CrayonElement and registered on the language. The reserved CRAYON_ELEMENT is
 * registered before the language's own elements and HTML_CHAR after them.
 *
 * The parse state is recorded once, after all rules have been processed, so that a rule that
 * fails late (or a file in which every rule fails) is still reported as PARSED_ERRORS.
 *
 * @param string $id Id of a language known to CrayonResources::langs().
 * @return bool|null FALSE when the language is not loaded or its file cannot be read;
 *         NULL otherwise (both when the language was already parsed and when parsing ran).
 */
public static function parse($id)
{
    // Verify the language is loaded and has not been parsed before
    if (!($lang = CrayonResources::langs()->get($id))) {
        CrayonLog::syslog(" The language with id ' $id ' was not loaded and could not be parsed. ");
        return FALSE;
    } else if ($lang->is_parsed()) {
        return;
    }

    // Read language file
    // NOTE(review): $file is used as a preg subject below, so lines() presumably returns the
    // contents as one string for these flags - confirm against CrayonUtil::lines().
    $path = CrayonResources::langs()->path($id);
    CrayonLog::debug('Parsing language ' . $path);
    if (($file = CrayonUtil::lines($path, 'wcs')) === FALSE) {
        CrayonLog::debug('Parsing failed ' . $path);
        return FALSE;
    }

    // Extract the language name and strip the line so it is not mistaken for an element rule
    $name_pattern = '#^[ \t]*name[ \t]+([^\r\n]+)[ \t]*#mi';
    preg_match($name_pattern, $file, $name);
    if (count($name) > 1) {
        $name = $name[1];
        $lang->name($name);
        $file = preg_replace($name_pattern, '', $file);
    } else {
        $name = $lang->id();
    }

    // Extract the language version
    $version_pattern = '#^[ \t]*version[ \t]+([^\r\n]+)[ \t]*#mi';
    preg_match($version_pattern, $file, $version);
    if (count($version) > 1) {
        $version = $version[1];
        $lang->version($version);
        $file = preg_replace($version_pattern, '', $file);
    }

    // Extract the modes; the alternation is built from the keys of self::$modes
    $mode_pattern = '#^[ \t]*(' . implode('|', array_keys(self::$modes)) . ')[ \t]+(?:=[ \t]*)?([^\r\n]+)[ \t]*#mi';
    preg_match_all($mode_pattern, $file, $mode_matches);
    if (count($mode_matches) == 3) {
        $mode_count = count($mode_matches[0]);
        for ($i = 0; $i < $mode_count; $i++) {
            $lang->mode($mode_matches[1][$i], $mode_matches[2][$i]);
        }
        $file = preg_replace($mode_pattern, '', $file);
    }

    /* Add reserved Crayon element. This is used by Crayon internally. */
    $crayon_element = new CrayonElement(self::CRAYON_ELEMENT, $path, self::CRAYON_ELEMENT_REGEX);
    $lang->element(self::CRAYON_ELEMENT, $crayon_element);

    // Extract elements, classes and regex
    $pattern = '#^[ \t]*([\w:]+)[ \t]+(?:\[([\w\t ]*)\][ \t]+)?([^\r\n]+)[ \t]*#m';
    preg_match_all($pattern, $file, $matches);

    if (!empty($matches[0])) {
        $elements = $matches[1];
        $classes = $matches[2];
        $regexes = $matches[3];
    } else {
        CrayonLog::syslog(" No regex patterns and/or elements were parsed from language file at ' $path '. ");
    }

    // Remember whether any rule failed so the final state can reflect it
    $error = FALSE;
    $rule_count = count($matches[0]);
    for ($i = 0; $i < $rule_count; $i++) {
        $name = trim(strtoupper($elements[$i]));
        $class = $classes[$i];
        $regex = $regexes[$i];

        // Ensure both the element and regex are valid
        if (empty($name) || empty($regex)) {
            CrayonLog::syslog(" Element(s) and/or regex(es) are missing in ' $path '. ");
            $error = TRUE;
            continue;
        }

        // Look for fallback element, written as NAME:FALLBACK
        $pieces = explode(':', $name);
        if (count($pieces) == 2) {
            $name = $pieces[0];
            $fallback = $pieces[1];
        } else if (count($pieces) == 1) {
            $name = $pieces[0];
            $fallback = '';
        } else {
            CrayonLog::syslog(" Too many colons found in element name ' $name ' in ' $path ' ");
            $error = TRUE;
            continue;
        }

        // Create a new CrayonElement
        $element = new CrayonElement($name, $path);
        $element->fallback($fallback);
        if (!empty($class)) {
            // Avoid setting known css to blank
            $element->css($class);
        }
        if ($element->regex($regex) === FALSE) {
            $error = TRUE;
            continue;
        }

        // Add the element, now carrying its regex, to the language
        $lang->element($name, $element);
    }

    // Record the outcome only after every rule has been seen. Setting it inside the loop (after
    // a successful rule) missed failures on later rules and never ran when all rules failed.
    if ($rule_count > 0) {
        $lang->state($error ? CrayonLang::PARSED_ERRORS : CrayonLang::PARSED_SUCCESS);
    }

    /* Prevents < > and other html entities from being printed as is, which could lead to actual html tags
     * from the printed code appearing on the page - not good. This can also act to color any HTML entities
     * that are not picked up by previously defined elements.
     */
    $html = new CrayonElement(self::HTML_CHAR, $path, self::HTML_CHAR_REGEX);
    $lang->element(self::HTML_CHAR, $html);
}
2019-08-30 19:30:19 +02:00
2021-11-29 01:25:11 +01:00
/**
 * Validates the regex of a CrayonElement and expands Crayon's special tags inside it.
 *
 * Steps, in order:
 *  1. (?alt:file) tags are replaced by an alternation of the lines read from "file", which is
 *     resolved relative to the directory of the element's path.
 *  2. (?default) / (?default:element) tags are replaced by the regex of the same-named (or the
 *     named) element of the default language.
 *  3. (?html:text) tags are replaced by htmlentities(text).
 *  4. The result is passed through CrayonUtil::esc_atomic() and CrayonUtil::esc_hash() and then
 *     test-compiled with preg_match().
 *
 * @param string $regex The raw regex.
 * @param object $element The element owning the regex; its class must be exactly the one named
 *        by CRAYON_ELEMENT_CLASS.
 * @return string|bool The processed regex; '' when $regex is not a string or $element is not of
 *         the expected class; FALSE when a tag cannot be resolved or the regex does not compile.
 */
public static function validate_regex($regex, $element)
{
    // get_class() throws a TypeError for non-objects on PHP 8, which '@' cannot suppress, so
    // rule out non-objects explicitly before asking for the class name.
    if (is_string($regex) && is_object($element) && get_class($element) == CRAYON_ELEMENT_CLASS) {
        // If the (?alt) tag has been used, insert the file into the regex
        $file = self::regex_match('#\(\?alt:(.+?)\)#', $regex);
        if (count($file) == 2) {
            // Element 0 has full match, 1 has captured groups
            for ($i = 0; $i < count($file[1]); $i++) {
                $file_lines = CrayonUtil::lines(dirname($element->path()) . crayon_s() . $file[1][$i], 'rcwh');
                if ($file_lines !== FALSE) {
                    $file_lines = implode('|', $file_lines);
                    // If any spaces exist, treat them as whitespace
                    $file_lines = preg_replace('#[ \t]+#msi', '\s+', $file_lines);
                    $regex = str_replace($file[0][$i], " (?: $file_lines ) ", $regex);
                } else {
                    CrayonLog::syslog(" Parsing of ' { $element -> path () } ' failed, an (?alt) tag failed for the element ' { $element -> name () } ' ");
                    return FALSE;
                }
            }
        }

        // If the (?default:element) function is used, replace the regex with the default, if exists
        $def = self::regex_match('#\(\?default(?:\:(\w+))?\)#', $regex);
        if (count($def) == 2) {
            // Load default language
            $default = CrayonResources::langs()->get(CrayonLangs::DEFAULT_LANG);
            // If default has not been loaded, we can't use it, skip the element
            if (!$default) {
                CrayonLog::syslog(
                    " Could not use default regex in the element ' { $element -> name () } ' in ' { $element -> path () } ' ");
                return FALSE;
            }
            for ($i = 0; $i < count($def[1]); $i++) {
                // Use the element named in the tag, or this element's own name when none is given
                $element_name = (!empty($def[1][$i])) ? $def[1][$i] : $element->name();
                if (($default_element = $default->element($element_name)) != FALSE) {
                    $regex = str_replace($def[0][$i], '(?:' . $default_element->regex() . ')', $regex);
                } else {
                    CrayonLog::syslog(" The language at ' { $element -> path () } ' referred to the Default Language regex for element ' { $element -> name () } ', which did not exist. ");
                    if (CRAYON_DEBUG) {
                        CrayonLog::syslog(" Default language URL: " . CrayonResources::langs()->url(CrayonLangs::DEFAULT_LANG));
                        CrayonLog::syslog(" Default language Path: " . CrayonResources::langs()->path(CrayonLangs::DEFAULT_LANG));
                    }
                    return FALSE;
                }
            }
        }

        // If the (?html) tag is used, escape characters in html (<, > and &)
        $html = self::regex_match('#\(\?html:(.+?)\)#', $regex);
        if (count($html) == 2) {
            for ($i = 0; $i < count($html[1]); $i++) {
                $regex = str_replace($html[0][$i], htmlentities($html[1][$i]), $regex);
            }
        }

        // Ensure all parenthesis are atomic to avoid conflicting with element matches
        $regex = CrayonUtil::esc_atomic($regex);

        // Escape #, this is our delimiter
        $regex = CrayonUtil::esc_hash($regex);

        // Test if regex is valid; '@' hides the warning preg_match() raises for a bad pattern
        if (@preg_match(" # $regex # ", '') === FALSE) {
            CrayonLog::syslog(" The regex for the element ' { $element -> name () } ' in ' { $element -> path () } ' is not valid. ");
            return FALSE;
        }

        return $regex;
    } else {
        return '';
    }
}
2019-08-30 19:30:19 +02:00
2021-11-29 01:25:11 +01:00
/**
 * Normalises a list of CSS class names: dots are removed, everything is lower-cased and
 * the names are re-joined with exactly one space between them.
 *
 * @param mixed $css Space separated class names; anything that is not a string yields ''.
 * @return string The cleaned class list, possibly empty.
 */
public static function validate_css($css)
{
    if (!is_string($css)) {
        return '';
    }

    $normalised = strtolower(str_replace('.', '', $css));

    // array_filter() without a callback drops the same pieces empty() would reject
    // (the empty strings left by repeated spaces, and the string "0").
    $class_names = array_filter(explode(' ', $normalised));

    return trim(implode(' ', $class_names));
}
2019-08-30 19:30:19 +02:00
2021-11-29 01:25:11 +01:00
/**
 * Thin wrapper around preg_match_all().
 *
 * @param string $pattern The delimited regex to apply.
 * @param string $subject The text to search.
 * @return array The match array filled by preg_match_all() when at least one match was
 *         found, otherwise an empty array.
 */
public static function regex_match($pattern, $subject)
{
    $found = array();
    $hits = preg_match_all($pattern, $subject, $found);

    // $hits is 0 when nothing matched and FALSE on error; both give the empty array.
    return $hits ? $found : array();
}
2019-08-30 19:30:19 +02:00
2021-11-29 01:25:11 +01:00
/**
 * Returns the table of recognised mode names.
 *
 * @return array The private $modes array, keyed by mode name.
 */
public static function modes()
{
    $known_modes = self::$modes;
    return $known_modes;
}
2019-08-30 19:30:19 +02:00
2021-11-29 01:25:11 +01:00
/**
 * Tells whether $name is one of the recognised mode names.
 *
 * @param mixed $name Candidate mode name.
 * @return bool TRUE only when $name is a string that is a key of the $modes table.
 */
public static function is_mode($name)
{
    if (!is_string($name)) {
        return FALSE;
    }
    return array_key_exists($name, self::$modes);
}
2019-08-30 19:30:19 +02:00
}