清理混乱的 HTML——为 TOM 新博客写的简单 PHP tidy
时间:2007-11-08
来源:互联网
项目背景:newblog.tom.com tom新博客过滤程序
用 PHP 的 Tidy 扩展来清理 HTML 时,由于 PHP Tidy 不支持 GB 系列编码,而博客采用的是 GBK 编码,每次调用 tidy 前都要先用 iconv 转换成 UTF-8,清理后再转换回来,这也导致一些字符在转换过程中出现问题,最终不能正常显示(为什么博客一开始不用 UTF-8 编码呢?想不通,不过解决问题才是关键,继续……)。最终决定自己实现一个 tidy,今天花了一点时间写了一个 0.1 版本,请大家指教哈。
[ 本帖最后由 murder 于 2007-11-9 17:20 编辑 ]
用 PHP 的 Tidy 扩展来清理 HTML 时,由于 PHP Tidy 不支持 GB 系列编码,而博客采用的是 GBK 编码,每次调用 tidy 前都要先用 iconv 转换成 UTF-8,清理后再转换回来,这也导致一些字符在转换过程中出现问题,最终不能正常显示(为什么博客一开始不用 UTF-8 编码呢?想不通,不过解决问题才是关键,继续……)。最终决定自己实现一个 tidy,今天花了一点时间写了一个 0.1 版本,请大家指教哈。
复制PHP内容到剪贴板
<?php
/**
 * BlogTidy
 * A small tag-balancing HTML tidy class for TOM BLOG.
 *
 * The source is scanned left to right while a stack of open element names is
 * maintained. Mismatched end tags are rewritten to close the innermost open
 * element, stray end tags are dropped, and end tags are appended for every
 * element still open when the input ends.
 *
 * All offsets are byte offsets and only ASCII tag syntax is inspected, so the
 * text between tags is copied through untouched and never re-encoded.
 *
 * @author [MSN]murderxchip(at)gmail.com
 * @version 0.1
 * @copyright tom newblog team
 *
 * @package class
 */
class BlogTidy {
    /** @var array|null names of the currently open elements, innermost last */
    private $tags;

    /** @var string cleaned markup produced by the current/last tidy() run */
    private $tidy_content;

    /** @var string markup being processed */
    private $src_content;

    /** @var int byte offset in $src_content of the first unprocessed byte */
    private $pointer;

    /**
     * Matches the next piece of markup: a complete HTML comment (taken as one
     * token so a '>' inside it cannot end it early), or any other <...> tag.
     *
     * @var string
     */
    private $pattern_tag = '/<!--.*?-->|<[^>]+>/s';

    /**
     * Lower-case names of void elements. They never have content or an end
     * tag, so they must not be tracked on the open-element stack.
     *
     * @var array
     */
    private $void_tags = array(
        'area' => true, 'base' => true, 'basefont' => true, 'br' => true,
        'col' => true, 'embed' => true, 'frame' => true, 'hr' => true,
        'img' => true, 'input' => true, 'link' => true, 'meta' => true,
        'param' => true, 'source' => true, 'track' => true, 'wbr' => true,
    );

    public function __construct(){
        $this->reset('');
    }

    public function __destruct(){
        $this->tags = null;
    }

    /**
     * Run the tidy pass over a piece of markup.
     *
     * State is reset on every call, so one instance can be reused for any
     * number of documents.
     *
     * @param string $srcContent markup to clean
     * @return string the cleaned markup
     */
    public function tidy($srcContent){
        $this->reset((string) $srcContent);
        $this->process();
        return $this->tidy_content;
    }

    /**
     * Print the result of the most recent tidy() call.
     */
    public function debug(){
        echo $this->tidy_content;
    }

    /**
     * Put the instance into a clean state for a new source string.
     *
     * @param string $source markup that the next process() call will scan
     */
    private function reset($source){
        $this->src_content = $source;
        $this->tidy_content = '';
        $this->tags = array();
        $this->pointer = 0;
    }

    /**
     * Walk through the source tag by tag and build the cleaned output.
     */
    private function process(){
        // PREG_OFFSET_CAPTURE reports where the tag starts, so the source does
        // not have to be searched a second time to locate it.
        while (preg_match($this->pattern_tag, $this->src_content, $match, PREG_OFFSET_CAPTURE, $this->pointer)) {
            $tagstring = $match[0][0];
            $next = $match[0][1] + strlen($tagstring);

            // Text preceding the tag plus the tag itself.
            $chunk = substr($this->src_content, $this->pointer, $next - $this->pointer);
            $this->pointer = $next;

            switch ($this->getTagType($tagstring)) {
                case 'START':
                    $this->processTagStart($tagstring, $chunk);
                    break;
                case 'END':
                    $this->processTagEnd($tagstring, $chunk);
                    break;
                default:
                    // Comments, doctypes, processing instructions and the
                    // like are copied through verbatim.
                    $this->tidy_content .= $chunk;
                    break;
            }
        }

        // Trailing text belongs inside the elements that are still open, so it
        // has to be emitted before their end tags.
        $this->tidy_content .= (string) substr($this->src_content, $this->pointer);

        while (count($this->tags) > 0) {
            $this->tidy_content .= sprintf('</%s>', array_pop($this->tags));
        }
    }

    /**
     * Classify a tag string.
     *
     * @param string $tagstring a complete <...> token
     * @return string|null 'END' for an end tag, 'START' for a start tag,
     *                     null for anything else
     */
    private function getTagType($tagstring){
        if (preg_match('/^<\//', $tagstring)) {
            return 'END';
        }
        if (preg_match('/^<\w+/', $tagstring)) {
            return 'START';
        }
        return null;
    }

    /**
     * Extract the element name from a start or end tag.
     *
     * @param string $tagstring a complete <...> token
     * @return string the name as written in the tag, or '' if none was found
     */
    private function extractTagName($tagstring){
        if (preg_match('/^<\/?\s*([\w:-]+)/', $tagstring, $found)) {
            return $found[1];
        }
        return '';
    }

    /**
     * Tell whether an element name denotes a void element.
     *
     * @param string $tagname element name in any letter case
     * @return bool
     */
    private function isVoid($tagname){
        return isset($this->void_tags[strtolower($tagname)]);
    }

    /**
     * Handle a start tag: copy it to the output and remember it as open.
     *
     * @param string $tagstring the start tag
     * @param string $chunk     text preceding the tag plus the tag itself
     */
    private function processTagStart($tagstring, $chunk){
        $this->tidy_content .= $chunk;

        $tagname = $this->extractTagName($tagstring);
        if ($tagname !== '' && !$this->isVoid($tagname)) {
            array_push($this->tags, $tagname); // push onto the open-element stack
        }
    }

    /**
     * Handle an end tag: keep it, rewrite it, or drop it.
     *
     * @param string $tagstring the end tag
     * @param string $chunk     text preceding the tag plus the tag itself
     */
    private function processTagEnd($tagstring, $chunk){
        // The chunk always ends with the tag; this is the text in front of it.
        $text = (string) substr($chunk, 0, strlen($chunk) - strlen($tagstring));
        $tagname = $this->extractTagName($tagstring);

        // An end tag for a void element is meaningless; drop it without
        // disturbing the stack.
        if ($tagname !== '' && $this->isVoid($tagname)) {
            $this->tidy_content .= $text;
            return;
        }

        $closetag = array_pop($this->tags);
        if ($closetag === null) {
            // Nothing is open: the end tag is stray, drop it.
            $this->tidy_content .= $text;
        } elseif ($tagname !== $closetag) {
            // Wrong element closed: close the innermost open element instead.
            $this->tidy_content .= $text . '</' . $closetag . '>';
        } else {
            $this->tidy_content .= $chunk;
        }
    }
}
// Quick demo: feed a deliberately malformed fragment to the tidier and print
// what it produces.
$malformedHtml = "<table><tr><td><p style='adf'>xxxr<span style=color:red>bbb</i><b>asdfasdfwe</b></tr></td>";

$tidier = new BlogTidy();
$tidier->tidy($malformedHtml);
$tidier->debug();
?>
输出结果:<table><tr><td><p style='adf'>xxxr<span style=color:red>bbb</span><b>asdfasdfwe</b></p></td></tr></table>
PHP代码:
<?php
/**
 * BlogTidy
 * A small tag-balancing HTML tidy class for TOM BLOG.
 *
 * The source is scanned left to right while a stack of open element names is
 * maintained. Mismatched end tags are rewritten to close the innermost open
 * element, stray end tags are dropped, and end tags are appended for every
 * element still open when the input ends.
 *
 * All offsets are byte offsets and only ASCII tag syntax is inspected, so the
 * text between tags is copied through untouched and never re-encoded.
 *
 * @author [MSN]murderxchip(at)gmail.com
 * @version 0.1
 * @copyright tom newblog team
 *
 * @package class
 */
class BlogTidy {
    /** @var array|null names of the currently open elements, innermost last */
    private $tags;

    /** @var string cleaned markup produced by the current/last tidy() run */
    private $tidy_content;

    /** @var string markup being processed */
    private $src_content;

    /** @var int byte offset in $src_content of the first unprocessed byte */
    private $pointer;

    /**
     * Matches the next piece of markup: a complete HTML comment (taken as one
     * token so a '>' inside it cannot end it early), or any other <...> tag.
     *
     * @var string
     */
    private $pattern_tag = '/<!--.*?-->|<[^>]+>/s';

    /**
     * Lower-case names of void elements. They never have content or an end
     * tag, so they must not be tracked on the open-element stack.
     *
     * @var array
     */
    private $void_tags = array(
        'area' => true, 'base' => true, 'basefont' => true, 'br' => true,
        'col' => true, 'embed' => true, 'frame' => true, 'hr' => true,
        'img' => true, 'input' => true, 'link' => true, 'meta' => true,
        'param' => true, 'source' => true, 'track' => true, 'wbr' => true,
    );

    public function __construct(){
        $this->reset('');
    }

    public function __destruct(){
        $this->tags = null;
    }

    /**
     * Run the tidy pass over a piece of markup.
     *
     * State is reset on every call, so one instance can be reused for any
     * number of documents.
     *
     * @param string $srcContent markup to clean
     * @return string the cleaned markup
     */
    public function tidy($srcContent){
        $this->reset((string) $srcContent);
        $this->process();
        return $this->tidy_content;
    }

    /**
     * Print the result of the most recent tidy() call.
     */
    public function debug(){
        echo $this->tidy_content;
    }

    /**
     * Put the instance into a clean state for a new source string.
     *
     * @param string $source markup that the next process() call will scan
     */
    private function reset($source){
        $this->src_content = $source;
        $this->tidy_content = '';
        $this->tags = array();
        $this->pointer = 0;
    }

    /**
     * Walk through the source tag by tag and build the cleaned output.
     */
    private function process(){
        // PREG_OFFSET_CAPTURE reports where the tag starts, so the source does
        // not have to be searched a second time to locate it.
        while (preg_match($this->pattern_tag, $this->src_content, $match, PREG_OFFSET_CAPTURE, $this->pointer)) {
            $tagstring = $match[0][0];
            $next = $match[0][1] + strlen($tagstring);

            // Text preceding the tag plus the tag itself.
            $chunk = substr($this->src_content, $this->pointer, $next - $this->pointer);
            $this->pointer = $next;

            switch ($this->getTagType($tagstring)) {
                case 'START':
                    $this->processTagStart($tagstring, $chunk);
                    break;
                case 'END':
                    $this->processTagEnd($tagstring, $chunk);
                    break;
                default:
                    // Comments, doctypes, processing instructions and the
                    // like are copied through verbatim.
                    $this->tidy_content .= $chunk;
                    break;
            }
        }

        // Trailing text belongs inside the elements that are still open, so it
        // has to be emitted before their end tags.
        $this->tidy_content .= (string) substr($this->src_content, $this->pointer);

        while (count($this->tags) > 0) {
            $this->tidy_content .= sprintf('</%s>', array_pop($this->tags));
        }
    }

    /**
     * Classify a tag string.
     *
     * @param string $tagstring a complete <...> token
     * @return string|null 'END' for an end tag, 'START' for a start tag,
     *                     null for anything else
     */
    private function getTagType($tagstring){
        if (preg_match('/^<\//', $tagstring)) {
            return 'END';
        }
        if (preg_match('/^<\w+/', $tagstring)) {
            return 'START';
        }
        return null;
    }

    /**
     * Extract the element name from a start or end tag.
     *
     * @param string $tagstring a complete <...> token
     * @return string the name as written in the tag, or '' if none was found
     */
    private function extractTagName($tagstring){
        if (preg_match('/^<\/?\s*([\w:-]+)/', $tagstring, $found)) {
            return $found[1];
        }
        return '';
    }

    /**
     * Tell whether an element name denotes a void element.
     *
     * @param string $tagname element name in any letter case
     * @return bool
     */
    private function isVoid($tagname){
        return isset($this->void_tags[strtolower($tagname)]);
    }

    /**
     * Handle a start tag: copy it to the output and remember it as open.
     *
     * @param string $tagstring the start tag
     * @param string $chunk     text preceding the tag plus the tag itself
     */
    private function processTagStart($tagstring, $chunk){
        $this->tidy_content .= $chunk;

        $tagname = $this->extractTagName($tagstring);
        if ($tagname !== '' && !$this->isVoid($tagname)) {
            array_push($this->tags, $tagname); // push onto the open-element stack
        }
    }

    /**
     * Handle an end tag: keep it, rewrite it, or drop it.
     *
     * @param string $tagstring the end tag
     * @param string $chunk     text preceding the tag plus the tag itself
     */
    private function processTagEnd($tagstring, $chunk){
        // The chunk always ends with the tag; this is the text in front of it.
        $text = (string) substr($chunk, 0, strlen($chunk) - strlen($tagstring));
        $tagname = $this->extractTagName($tagstring);

        // An end tag for a void element is meaningless; drop it without
        // disturbing the stack.
        if ($tagname !== '' && $this->isVoid($tagname)) {
            $this->tidy_content .= $text;
            return;
        }

        $closetag = array_pop($this->tags);
        if ($closetag === null) {
            // Nothing is open: the end tag is stray, drop it.
            $this->tidy_content .= $text;
        } elseif ($tagname !== $closetag) {
            // Wrong element closed: close the innermost open element instead.
            $this->tidy_content .= $text . '</' . $closetag . '>';
        } else {
            $this->tidy_content .= $chunk;
        }
    }
}
// Quick demo: feed a deliberately malformed fragment to the tidier and print
// what it produces.
$malformedHtml = "<table><tr><td><p style='adf'>xxxr<span style=color:red>bbb</i><b>asdfasdfwe</b></tr></td>";

$tidier = new BlogTidy();
$tidier->tidy($malformedHtml);
$tidier->debug();
?>
[ 本帖最后由 murder 于 2007-11-9 17:20 编辑 ]
作者: murder 发布时间: 2007-11-08
谢谢分享,收藏了

作者: 菜刀 发布时间: 2007-11-08
这个程序让我想起以前学习数据结构时。
开门要记得关门,否则嵌套时后果很严重。
效率不怎么样,有空我改改。
开门要记得关门,否则嵌套时后果很严重。
效率不怎么样,有空我改改。
作者: smallwl 发布时间: 2008-02-14


作者: luzhou 发布时间: 2008-02-14
相关阅读 更多
热门阅读
-
office 2019专业增强版最新2021版激活秘钥/序列号/激活码推荐 附激活工具
阅读:74
-
如何安装mysql8.0
阅读:31
-
Word快速设置标题样式步骤详解
阅读:28
-
20+道必知必会的Vue面试题(附答案解析)
阅读:37
-
HTML如何制作表单
阅读:22
-
百词斩可以改天数吗?当然可以,4个步骤轻松修改天数!
阅读:31
-
ET文件格式和XLS格式文件之间如何转化?
阅读:24
-
react和vue的区别及优缺点是什么
阅读:121
-
支付宝人脸识别如何关闭?
阅读:21
-
腾讯微云怎么修改照片或视频备份路径?
阅读:28