/**
 * Simple SAX-like XML parser for PHPTAL.
 *
 * Because PHP XML parser libraries tended to fail at giving a real XML
 * document representation (at the time this file was created it was
 * impossible to retrieve doctypes and XML declarations, and there were
 * problems with comments and CDATA), this parser was created. It is
 * deliberately lenient about some user errors, such as a stray '<' inside
 * text nodes or attribute values.
 *
 * The parser is a single-pass, byte-oriented state machine: parseString()
 * walks the source one byte at a time and invokes the abstract on*()
 * callbacks as soon as a construct is complete. Every callback receives raw,
 * undecoded substrings of the source.
 *
 * @package phptal
 * @author Laurent Bedubourg
 * @see PHPTAL_Dom_Parser
 */
abstract class PHPTAL_XmlParser
{
    // available parser states
    const ST_ROOT = 0;
    const ST_TEXT = 1;
    const ST_LT = 2;
    const ST_TAG_NAME = 3;
    const ST_TAG_CLOSE = 4;
    const ST_TAG_SINGLE = 5;
    const ST_TAG_ATTRIBUTES = 6;
    const ST_CDATA = 7;
    const ST_COMMENT = 8;
    const ST_DOCTYPE = 9;
    const ST_XMLDEC = 15;
    const ST_PREPROC = 10;
    const ST_ATTR_KEY = 11;
    const ST_ATTR_EQ = 12;
    const ST_ATTR_QUOTE = 13;
    const ST_ATTR_VALUE = 14;

    // exceptions error messages
    const ERR_CHARS_BEFORE_DOC_START =
        "Characters found before the begining of the document!";
    const ERR_EXPECT_VALUE_QUOTE =
        "Unexpected '%s' character, expecting attribute single or double quote";

    const BOM_STR = "\xef\xbb\xbf";

    /** Human readable state names, used in the "unfinished document" error. */
    static $state_names = array(
        self::ST_ROOT => 'root node',
        self::ST_TEXT => 'text',
        self::ST_LT => 'start of tag',
        self::ST_TAG_NAME => 'tag name',
        self::ST_TAG_CLOSE => 'closing tag',
        self::ST_TAG_SINGLE => 'self-closing tag',
        self::ST_TAG_ATTRIBUTES => 'tag',
        self::ST_CDATA => 'CDATA',
        self::ST_COMMENT => 'comment',
        self::ST_DOCTYPE => 'doctype',
        self::ST_XMLDEC => 'XML declaration',
        self::ST_PREPROC => 'preprocessor directive',
        self::ST_ATTR_KEY => 'attribute name',
        self::ST_ATTR_EQ => 'attribute value',
        self::ST_ATTR_QUOTE => 'quoted attribute value',
        self::ST_ATTR_VALUE => 'unquoted attribute value',
    );

    public function __construct()
    {
        $this->_file = "";
    }

    /**
     * Reads a file and parses its content.
     *
     * @param string $src path of the file to parse
     *
     * @throws PHPTAL_Exception when the file does not exist, cannot be read,
     *                          or its content fails to parse
     */
    public function parseFile($src)
    {
        if (!file_exists($src)) {
            throw new PHPTAL_Exception("file $src not found");
        }

        $content = file_get_contents($src);
        if ($content === false) {
            // an existing but unreadable file must not be parsed as an empty document
            throw new PHPTAL_Exception("file $src could not be read");
        }

        $this->parseString($content, $src);
    }

    /**
     * Parses an XML string, firing the on*() callbacks as constructs complete.
     *
     * @param string $src      source to parse (a leading UTF-8 BOM is skipped)
     * @param string $filename name reported by getSourceFile() and in errors
     *
     * @throws PHPTAL_Exception on malformed input or when the document ends
     *                          in the middle of a construct
     */
    public function parseString($src, $filename = '')
    {
        $this->_file = $filename;

        // remove BOM (utf8 byte order mark)...
        if (substr($src, 0, 3) === self::BOM_STR) {
            $src = substr($src, 3);
        }

        $this->_line = 1;
        $state = self::ST_ROOT;
        $mark = 0;                 // offset where the construct being read started
        $len = strlen($src);

        $quoteStyle = '"';
        $tagname = "";
        $attribute = "";
        $attributes = array();

        $customDoctype = false;    // true while inside the [...] subset of a doctype

        $this->onDocumentStart();

        for ($i = 0; $i < $len; $i++) {
            $c = $src[$i];

            if ($c == "\n") {
                $this->_line++;
            }

            switch ($state) {
                case self::ST_ROOT:
                    if ($c == '<') {
                        $mark = $i; // mark tag start
                        $state = self::ST_LT;
                    } else if (!self::isWhiteChar($c)) {
                        $this->raiseError(self::ERR_CHARS_BEFORE_DOC_START);
                    }
                    break;

                case self::ST_TEXT:
                    if ($c == '<') {
                        if ($mark != $i) {
                            $this->onElementData(substr($src, $mark, $i - $mark));
                        }
                        $mark = $i;
                        $state = self::ST_LT;
                    }
                    break;

                case self::ST_LT:
                    if ($c == '/') {
                        $mark = $i + 1;
                        $state = self::ST_TAG_CLOSE;
                    } else if ($c == '?'
                        && substr($src, $i, 4) == '?xml'
                        && $i + 4 < $len
                        && self::isWhiteChar($src[$i + 4])
                    ) {
                        // Only "<?xml" followed by whitespace is the XML
                        // declaration; "<?xml-stylesheet" and friends are
                        // ordinary processing instructions.
                        $state = self::ST_XMLDEC;
                    } else if ($c == '?') {
                        $state = self::ST_PREPROC;
                    } else if ($c == '!' && substr($src, $i, 3) == '!--') {
                        $state = self::ST_COMMENT;
                    } else if ($c == '!' && substr($src, $i, 8) == '![CDATA[') {
                        $state = self::ST_CDATA;
                    } else if ($c == '!' && substr($src, $i, 8) == '!DOCTYPE') {
                        $state = self::ST_DOCTYPE;
                    } else if (!self::isAlpha($c)) {
                        // stray '<': $mark still points at it, so it is
                        // emitted as part of the surrounding text
                        $state = self::ST_TEXT;
                    } else {
                        $mark = $i; // mark node name start
                        $attributes = array();
                        $attribute = "";
                        $state = self::ST_TAG_NAME;
                    }
                    break;

                case self::ST_TAG_NAME:
                    if (self::isWhiteChar($c)) {
                        $tagname = substr($src, $mark, $i - $mark);
                        $state = self::ST_TAG_ATTRIBUTES;
                    } else if ($c == '/') {
                        $tagname = substr($src, $mark, $i - $mark);
                        $state = self::ST_TAG_SINGLE;
                    } else if ($c == '>') {
                        $tagname = substr($src, $mark, $i - $mark);
                        $mark = $i + 1; // mark text start
                        $state = self::ST_TEXT;
                        $this->onElementStart($tagname, $attributes);
                    }
                    break;

                case self::ST_TAG_CLOSE:
                    if ($c == '>') {
                        $tagname = rtrim(substr($src, $mark, $i - $mark));
                        $this->onElementClose($tagname);
                        $mark = $i + 1; // mark text start
                        $state = self::ST_TEXT;
                    }
                    break;

                case self::ST_TAG_SINGLE:
                    if ($c != '>') {
                        // '/' inside a tag must be immediately followed by '>';
                        // accepting anything else would silently drop $c
                        $this->raiseError(
                            "Expected '/>', but found '/%s' inside tag %s",
                            $c,
                            $tagname
                        );
                    }
                    $mark = $i + 1; // mark text start
                    $state = self::ST_TEXT;
                    $this->onElementStart($tagname, $attributes);
                    $this->onElementClose($tagname);
                    break;

                case self::ST_TAG_ATTRIBUTES:
                    if ($c == '>') {
                        $mark = $i + 1; // mark text start
                        $state = self::ST_TEXT;
                        $this->onElementStart($tagname, $attributes);
                    } else if ($c == '/') {
                        $state = self::ST_TAG_SINGLE;
                    } else if (!self::isWhiteChar($c)) {
                        $mark = $i; // mark attribute key start
                        $state = self::ST_ATTR_KEY;
                    }
                    break;

                case self::ST_COMMENT:
                    // The shortest complete comment is "<!---->" (7 bytes), so
                    // '>' must sit at least 6 bytes after '<'; otherwise the
                    // dashes of the opener would be mistaken for the closing
                    // "--" (as in "<!-->" or "<!--->").
                    if ($c == '>'
                        && $i - $mark >= 6
                        && substr($src, $i - 2, 2) == '--'
                    ) {
                        $this->onComment(substr($src, $mark, $i - $mark + 1));
                        $mark = $i + 1; // mark text start
                        $state = self::ST_TEXT;
                    }
                    break;

                case self::ST_CDATA:
                    if ($c == '>' && substr($src, $i - 2, 2) == ']]') {
                        $this->onSpecific(substr($src, $mark, $i - $mark + 1));
                        $mark = $i + 1; // mark text start
                        $state = self::ST_TEXT;
                    }
                    break;

                case self::ST_XMLDEC:
                    if ($c == '?' && substr($src, $i, 2) == '?>') {
                        $this->onXmlDecl(substr($src, $mark, $i - $mark + 2));
                        $i++; // skip '>'
                        $mark = $i + 1; // mark text start
                        $state = self::ST_TEXT;
                    }
                    break;

                case self::ST_DOCTYPE:
                    if ($c == '[') {
                        $customDoctype = true;
                    } else if ($c == '>'
                        && (!$customDoctype || substr($src, $i - 1, 2) == ']>')
                    ) {
                        // with an internal subset, only "]>" ends the doctype
                        $customDoctype = false;
                        $this->onDocType(substr($src, $mark, $i - $mark + 1));
                        $mark = $i + 1; // mark text start
                        $state = self::ST_TEXT;
                    }
                    break;

                case self::ST_PREPROC:
                    // The closing '?' must not be the opener's own '?'
                    // (as it would be in "<?>").
                    if ($c == '>' && $i - $mark >= 3 && $src[$i - 1] == '?') {
                        $this->onSpecific(substr($src, $mark, $i - $mark + 1));
                        $mark = $i + 1; // mark text start
                        $state = self::ST_TEXT;
                    }
                    break;

                case self::ST_ATTR_KEY:
                    if (self::isWhiteChar($c)) {
                        $attribute = substr($src, $mark, $i - $mark);
                        $state = self::ST_ATTR_EQ;
                    } else if ($c == '=') {
                        $attribute = substr($src, $mark, $i - $mark);
                        $state = self::ST_ATTR_VALUE;
                    }
                    break;

                case self::ST_ATTR_EQ:
                    if ($c == '=') {
                        $state = self::ST_ATTR_VALUE;
                    }
                    break;

                case self::ST_ATTR_VALUE:
                    if (self::isWhiteChar($c)) {
                        // whitespace between '=' and the opening quote is skipped
                    } else if ($c == '"' || $c == '\'') {
                        $quoteStyle = $c;
                        $state = self::ST_ATTR_QUOTE;
                        $mark = $i + 1; // mark attribute real value start
                    } else {
                        // $c goes in as a format argument: raiseError() does
                        // the formatting, and pre-formatting here would make a
                        // '%' character be interpreted as a directive
                        $this->raiseError(self::ERR_EXPECT_VALUE_QUOTE, $c);
                    }
                    break;

                case self::ST_ATTR_QUOTE:
                    if ($c == $quoteStyle) {
                        if (isset($attributes[$attribute])) {
                            // names are passed as arguments so that a '%' in
                            // them cannot break the message formatting
                            $this->raiseError(
                                "Attribute '%s' on '%s' is defined more than once",
                                $attribute,
                                $tagname
                            );
                        }
                        $attributes[$attribute] = substr($src, $mark, $i - $mark);
                        $state = self::ST_TAG_ATTRIBUTES;
                    }
                    break;
            }
        }

        if ($state == self::ST_TEXT) {
            // only whitespace is tolerated in the text left over at the end
            // of the source
            if ($i > $mark) {
                $text = substr($src, $mark, $i - $mark);
                if (!ctype_space($text)) {
                    $this->raiseError("Characters found after end of the root element");
                }
            }
        } else {
            throw new PHPTAL_Exception(
                "Finished document in unexpected state: "
                . self::$state_names[$state] . " is not finished"
            );
        }

        $this->onDocumentEnd();
    }

    /**
     * @return string file name given to the last parseFile()/parseString() call
     */
    public function getSourceFile()
    {
        return $this->_file;
    }

    /**
     * @return int 1-based line number the parser has reached
     */
    public function getLineNumber()
    {
        return $this->_line;
    }

    /**
     * Tells whether a single byte is a space, tab, newline, carriage return
     * or NUL.
     *
     * @param string $c one-byte string
     *
     * @return bool
     */
    public static function isWhiteChar($c)
    {
        return strpos(" \t\n\r\0", $c) !== false;
    }

    /**
     * Tells whether a single byte is an ASCII letter.
     *
     * @param string $c one-byte string
     *
     * @return bool
     */
    public static function isAlpha($c)
    {
        $char = strtolower($c);
        return ($char >= 'a' && $char <= 'z');
    }

    /** Receives the raw doctype, from '<' to the closing '>'. */
    abstract public function onDocType($doctype);

    /** Receives the raw XML declaration, from '<?xml' to '?>'. */
    abstract public function onXmlDecl($decl);

    /** Receives a raw CDATA section or processing instruction, delimiters included. */
    abstract public function onSpecific($data);

    /** Receives a raw comment, from '<!--' to '-->'. */
    abstract public function onComment($data);

    /**
     * Called for every opening (or self-closing) tag.
     *
     * @param string $name       tag name as written in the source
     * @param array  $attributes attribute name => raw (undecoded) value
     */
    abstract public function onElementStart($name, $attributes);

    /** Called for every closing tag and right after a self-closing tag. */
    abstract public function onElementClose($name);

    /** Receives the raw text found between two tags. */
    abstract public function onElementData($data);

    /** Called once before the first byte is examined. */
    abstract public function onDocumentStart();

    /** Called once after the whole source was parsed successfully. */
    abstract public function onDocumentEnd();

    /**
     * Throws a parse error that mentions the parser class, file and line.
     *
     * The arguments are handed to sprintf() as-is, so $errFmt is ALWAYS
     * treated as a format string: variable data must be passed as additional
     * arguments, never interpolated into $errFmt.
     *
     * @param string $errFmt sprintf() format, followed by its arguments
     *
     * @throws PHPTAL_Exception always
     */
    protected function raiseError($errFmt)
    {
        $args = func_get_args();
        $errStr = call_user_func_array('sprintf', $args);

        $str = sprintf(
            "%s error: %s in %s:%d",
            get_class($this),
            $errStr,
            $this->_file,
            $this->_line
        );
        throw new PHPTAL_Exception($str);
    }

    private $_file;
    private $_line;
}