以前写的一个采集系统(代码有点乱)
时间:2008-11-25
来源:互联网
几乎可以采集防盗链的全部图片,不搞这活两年了
一共5个文件,包括函数文件,规则指定文件
循环采集文件
标注得比较清楚,有些正则没有整合,请见谅。代码在自己电脑里放了两年,还是把它发出来,顺便给自己做点广告
看看我的新站点: http://www.biz-m.com
1.函数部分
<?
/**
 * Delete one record, its image files and its companion "<table>txt" row.
 *
 * @param string $table Main table name; the companion table is "{$table}txt".
 * @param mixed  $id    Value of the UserID column identifying the record.
 * @return bool Always true (kept for backward compatibility; query results
 *              are not inspected, exactly as before).
 */
function admin_del($table,$id)
{
    // The id ends up inside a quoted SQL literal, so escape it first.
    $id    = mysql_real_escape_string($id);
    $where = " UserID='$id'";
    // The original passed an undefined $limit to result(); pass an explicit null.
    $limit = null;

    #- main table: look the row up first so its images can be removed
    $result = result($table,$where,$limit);
    $row    = $result ? mysql_fetch_array($result) : false;
    if($row && !empty($row['img']))
    {
        delmap($row['img']); //- delete the image files listed on the row
    }
    mysql_query("DELETE FROM $table WHERE $where"); //- delete the main row

    #- companion text table
    $tabletxt = $table."txt";
    mysql_query("DELETE FROM $tabletxt WHERE $where");

    return true;
}
/**
 * Remove stale files from a cache directory.
 *
 * A regular file is stale when its modification time is more than
 * 4320 seconds in the past.
 *
 * @param string $dir Directory to clean; must end with a slash because file
 *                    names are appended to it directly.
 * @return int Number of files removed (0 when the directory is missing,
 *             unreadable or holds nothing stale).
 */
function delcache($dir="../cache/")
{
    $removed = 0; // the original never initialised its counter
    if(!is_dir($dir))
    {
        return $removed;
    }
    $dh = opendir($dir);
    if(!$dh)
    {
        return $removed;
    }
    while(($file = readdir($dh)) !== false)
    {
        $target = $dir.$file;
        if(filetype($target)=="file" && (filemtime($target)+4320)<time())
        {
            if(@unlink($target))
            {
                $removed++;
            }
        }
    }
    closedir($dh);
    return $removed;
}
#-- Download a remote image into the local cache
/**
 * Validate an image URL and, on request, cache the image locally.
 *
 * @param string $url   Image URL; only .gif/.jpg/.png/.jpeg are accepted.
 * @param string $path  Cache directory, relative to the parent of the
 *                      current working directory.
 * @param mixed  $str   Falsy: return $url untouched. Truthy: download it.
 * @param string $str1  "minBytes,minWidth,minHeight"; missing or empty
 *                      parts default to 2048, 200 and 200.
 * @param mixed  $str2  Suffix appended to the random part of the file name.
 * @return string|false $url (no caching requested), the cached file path
 *                      without the leading "../", or false when the URL is
 *                      not an image or the download fails the size limits.
 */
function getImage($url,$path,$str,$str1,$str2)
{
    $ext = strrchr(strtolower($url),".");
    if($ext != ".gif" && $ext != ".jpg" && $ext != ".png" && $ext != ".jpeg") return false;
    if(!$str) return $url;

    $rand      = rand(6,100000).$str2;
    $dir       = $path."/".date("ymj");
    $filename  = $dir."/".time().$rand.$ext;
    $_filename = "../".$filename;

    #-- create the cache directory (recursively) when missing
    if(!is_dir("../".$dir) && !@mkdir("../".$dir,0777,true))
    {
        return false;
    }

    #-- first attempt: raw HTTP request with a same-host Referer, which
    #-- gets past simple hot-link protection
    $text     = '';
    $hostPath = str_replace('http://','',$url);
    $pieces   = explode('/',$hostPath);
    $host     = $pieces[0];
    $reqPath  = strstr($hostPath,'/');
    if($reqPath === false) $reqPath = '/';
    $fp = @fsockopen($host, 80, $errno, $errstr, 30);
    if($fp)
    {
        @fputs($fp, "GET $reqPath HTTP/1.1\r\n");
        @fputs($fp, "Host: $host\r\n");
        @fputs($fp, "Accept: */*\r\n");
        @fputs($fp, "Referer: http://$host/\r\n");
        @fputs($fp, "User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)\r\n");
        @fputs($fp, "Connection: Close\r\n\r\n");
        $Content = '';
        while(!feof($fp))
        {
            $chunk = fread($fp, 4096);
            if($chunk === false || $chunk === '') break;
            $Content .= $chunk;
        }
        fclose($fp); // the original leaked this socket
        $pos = strpos($Content,"\r\n\r\n");
        if($pos !== false) $text = substr($Content,$pos+4);
    }

    #-- second attempt: a suspiciously small body is re-fetched through the
    #-- URL wrapper (this line was mangled by forum markup in the original)
    if(strlen($text) < 1000)
    {
        $fallback = @file_get_contents($url);
        $text     = ($fallback === false) ? '' : $fallback;
    }

    $out = @fopen($_filename,"w");
    if(!$out)
    {
        return false;
    }
    fwrite($out,$text);
    fclose($out);

    #-- enforce the minimum byte size and pixel dimensions
    $size    = @getimagesize($_filename);
    $W       = $size ? $size[0] : 0;
    $H       = $size ? $size[1] : 0;
    $_str1   = explode(',',(string)$str1);
    $imgsize = !empty($_str1[0]) ? $_str1[0] : 2048;
    $img_w   = !empty($_str1[1]) ? $_str1[1] : 200;
    $img_h   = !empty($_str1[2]) ? $_str1[2] : 200;
    clearstatcache();
    $bytes   = @filesize($_filename);
    if(file_exists($_filename) && $bytes > $imgsize && $W > $img_w && $H > $img_h)
    {
        return $filename; // cached: hand back the local path
    }
    delmap($filename);    // too small: remove the cached file again
    return false;
}
# Collect the images named by a list of <img> attribute strings
/**
 * Pull the src value out of each attribute string and pass it to getImage().
 *
 * @param array  $string Attribute strings as returned by map().
 * @param string $path   Cache directory handed to getImage().
 * @param string $str    "new" = only validate the URL, "all" = download it;
 *                       any other value collects nothing.
 * @param string $str1   Size specification handed to getImage().
 * @return array One getImage() result per entry (may contain false).
 */
function outImage($string,$path,$str,$str1)
{
    $map = array();
    if(!is_array($string))
    {
        return $map;
    }
    foreach($string as $key => $value)
    {
        $afterSrc = explode("src=",$value);
        $rest     = isset($afterSrc[1]) ? trim($afterSrc[1]) : "";
        $tokens   = explode(" ",$rest);
        // str_replace stands in for ereg_replace, which PHP 7 removed
        $mapL     = trim(str_replace(array('"',"'"),"",trim($tokens[0])));
        if($str=="new")$map[] = getImage($mapL,$path,0,$str1,$key);
        if($str=="all")$map[] = getImage($mapL,$path,1,$str1,$key);
    }
    return $map; // image URLs / cached paths
}
# Collect the attribute text of every <a ...> tag in the markup
function get($data)
{
    $found = array();
    // group 1 = everything between "<a " and the closing ">"
    preg_match_all('/\<a ([^>]+)\>/is', $data, $found);
    $attributes = $found[1];
    return $attributes;
}
# Collect the attribute text of every <img ...> tag in the markup
function map($data)
{
    $found = array();
    // group 1 = everything between "<img " and the closing ">"
    preg_match_all('/\<img ([^>]+)\>/is', $data, $found);
    $attributes = $found[1];
    return $attributes;
}
#-- Make every href/src value in a page absolute
/**
 * Find each href=/src= snippet and replace it with the completed form
 * produced by lIIIIl().
 *
 * @param string $l1 Page markup.
 * @param string $l2 URL the page was fetched from (base for completion).
 * @return string Markup with the snippets rewritten; returned unchanged
 *                when nothing matches or the regex engine reports an error.
 */
function formaturl($l1,$l2)
{
    $pattern = '/(href\s*=\s*("|\')?([^\s"\'].)*("|\')?(.*)>)|(src\s*=\s*("|\')?([^\s"\'].)*("|\')?(.*)>)/iU';
    $regs = array();
    // The original had a stray ";" after this condition, so its guard never applied.
    if(!preg_match_all($pattern,$l1,$regs))
    {
        return $l1;
    }
    foreach($regs[0] as $url)
    {
        $l1 = str_replace($url,lIIIIl($url,$l2),$l1);
    }
    return $l1;
}
# URL completion
/**
 * Directory part of a URL path, without a trailing slash.
 *
 * "/a/b/c.html" -> "/a/b", "/a/b/" -> "/a/b", "/index.html" -> "".
 * The root is returned as "" so callers can always append "/name".
 *
 * @param string $path URL path.
 * @return string
 */
function lIIIIl_basedir($path)
{
    if($path == "")
    {
        return "";
    }
    $dir = (substr($path,-1) == "/") ? rtrim($path,"/") : dirname($path);
    $dir = str_replace("\\","/",$dir); // dirname() yields "\" on Windows
    return ($dir == "/" || $dir == ".") ? "" : $dir;
}

/**
 * Complete the URL inside one "href=..." / "src=..." snippet.
 *
 * All quotes are first stripped from the snippet (as the original did);
 * a relative URL is then resolved against $l2 and put back wrapped in
 * double quotes. Absolute URLs, other schemes (mailto:, javascript:, ...)
 * and protocol-relative URLs are left as they are.
 *
 * @param string $l1 Snippet such as 'href="a.html" class="x">'.
 * @param string $l2 URL of the page the snippet came from.
 * @return string The rewritten snippet, or the quote-stripped snippet when
 *                there is nothing to complete.
 */
function lIIIIl($l1,$l2)
{
    $l1 = preg_replace("/\"|'/","",$l1);
    if(!preg_match("/(.*)(href|src)\=(.+?)( |\/\>|\>).*/i",$l1,$regs,PREG_OFFSET_CAPTURE))
    {
        return $l1;
    }
    $I2    = $regs[3][0]; // URL text exactly as found
    $I2pos = $regs[3][1]; // its byte offset, so only this occurrence is replaced
    $I1    = $I2;

    $url_parsed = parse_url($l2);
    if(!is_array($url_parsed))
    {
        return $l1;
    }
    $scheme = isset($url_parsed["scheme"]) ? $url_parsed["scheme"]."://" : "";
    $host   = isset($url_parsed["host"]) ? $url_parsed["host"] : "";
    if($host != "" && isset($url_parsed["port"]))
    {
        $host .= ":".$url_parsed["port"]; // keep non-default ports
    }
    $l3 = $scheme.$host;
    if(strlen($l3)==0)
    {
        return $l1;
    }
    $rawPath = isset($url_parsed["path"]) ? $url_parsed["path"] : "";
    $_url1   = $l3.($rawPath == "" ? "/" : $rawPath);
    $path    = lIIIIl_basedir($rawPath);

    # drop the fragment
    $pos = strpos($I1,"#");
    if($pos>0) $I1 = substr($I1,0,$pos);

    # anything that already carries a scheme, or is protocol-relative, is skipped
    if(preg_match('/^[a-z][a-z0-9+.\-]*:/i',$I1) || substr($I1,0,2)=="//")
    {
        return $l1;
    }
    elseif($I1[0]=="/")
    {
        $I1 = $l3.$I1;
    }
    elseif(substr($I1,0,3)=="../")
    {
        // Climb one directory per "../"; the original computed the parent
        // but then dropped it, resolving everything against the host root.
        while(substr($I1,0,3)=="../")
        {
            $I1   = substr($I1,3);
            $path = lIIIIl_basedir($path);
        }
        $I1 = $l3.$path."/".$I1;
    }
    elseif(substr($I1,0,2)=="./")
    {
        $I1 = $l3.$path.substr($I1,1);
    }
    elseif(substr($I1,0,2)==" /")
    {
        $I1 = $l3.trim($I1);
    }
    elseif(substr($I1,0,1)=="?")
    {
        $I1 = $_url1.$I1;
    }
    else
    {
        $I1 = $l3.$path."/".$I1;
    }
    return substr_replace($l1,"\"$I1\"",$I2pos,strlen($I2));
}
#- Category lists
/**
 * Echo the sub-categories of one top-level category.
 *
 * @param mixed $j   Top-level category number (1-5); any other value
 *                   prints no entries.
 * @param mixed $str Truthy: emit JavaScript "select2[j][n] = new Option(...)"
 *                   lines followed by blank lines. Falsy: emit <option> tags.
 * @return void Output is echoed, nothing is returned.
 */
function s_l($j,$str)
{
    // Each list starts with "," so entry 0 is empty and real entries start at 1.
    $groups = array(
        1 => ",视频电影,幽默笑话,明星偶像,读书频道,占星测试,动漫卡通,旅游世界,娱乐八卦",
        2 => ",服饰专题,美容专题,美体专题,休闲专题,情感专题,保健专题,母婴专题,两性话题,居家专题,时尚生活",
        3 => ",",
        4 => ",Photo技巧,Flash技巧,js 资源,php 资源,jsp资源,css 资源,源码下载,搜索优化,IT网赚",
        5 => ",营销知识,管理知识,项目管理,生产管理,人力资源,成功激励,创业宝典,实用工具,标准认证,财经知识,政策法规,合同范本,经典案例,文档模板,商务礼仪,报关手册,各国通关,国际贸易,贸易结算,外汇管理,外贸知识,货物运输,加工贸易,检验检疫,笑谈商务,行业动态,系列图书,综合资讯",
    );
    $sortwords = isset($groups[$j]) ? $groups[$j] : "";
    $s_list    = explode(",",$sortwords);
    $iA        = count($s_list);
    // The original wrote "for($i>0;...)", which never initialised $i and
    // only worked because an unset counter increments to 1.
    for($i=1;$i<$iA;$i++)
    {
        $s = ($i-1);
        if($str)echo "select2[".$j."][".$s."] = new Option(\"".trim($s_list[$i])."\", \"".$i."\"); \n";
        else echo "<option value=\"$i\">$s_list[$i]</option>";
    }
    if($str)echo " \n \n";
}
#- Delete image files
/**
 * Remove every file named in a comma-separated list.
 *
 * Each entry is resolved relative to the parent of the current working
 * directory ("../<entry>"); empty entries and missing files are skipped.
 *
 * @param string $img Comma-separated relative paths.
 */
function delmap($img)
{
    foreach(explode(",",$img) as $relative)
    {
        if(!$relative)
        {
            continue;
        }
        $target = "../$relative";
        if(file_exists($target))
        {
            unlink($target);
        }
    }
}
#- Resolve the data table for a category number
/**
 * Map a category number to its table name.
 *
 * @param mixed $string Category number ('1'..'5'), compared loosely.
 * @return string|false Table name, or false when the number is unknown.
 */
function data_table($string)
{
    $tables = array('yule' =>'1','woman' =>'2','info' =>'3','photo' =>'4','business' =>'5');
    foreach($tables as $tableName => $number)
    {
        if($number == $string)
        {
            return $tableName;
        }
    }
    return false;
}
#- Cut a section out of a string
/**
 * Return the part of $string from the first $open up to (not including)
 * the first following $end.
 *
 * @param string $string Text to cut.
 * @param string $open   Start marker (kept in the result); falsy = start of text.
 * @param string $end    End marker; falsy = no cut at the end.
 * @return string|false  false when $open is not found and $end is falsy;
 *                       an empty string when $end is given but not found.
 */
function newdata($string,$open,$end)
{
    $section = $open ? strstr($string,$open) : $string;
    if(!$end)
    {
        return $section;
    }
    $cutAt = strpos($section,$end);
    return substr($section,0,$cutAt);
}
#- Convert upper case to lower case
/**
 * Lower-case a string.
 *
 * @param string $t Input text.
 * @return string $t with ASCII letters lower-cased.
 */
function foo($t)
{
    // An unreachable second return (using an undefined $s) was removed.
    return strtolower($t);
}
#- String replacement
/**
 * Clean a text block before it is stored.
 *
 * Strips slash-escaping, then applies the replacement table below in order,
 * each step seeing the result of the previous one, and trims the result.
 *
 * NOTE(review): several pairs replace a string with itself, and the
 * "\(", "\)", "\?" and "/**​/" pairs can never match because backslashes
 * and slashes are removed first. They may once have been HTML entities
 * that were decoded when the code was posted — confirm before changing.
 *
 * @param string $str Raw text.
 * @return string Cleaned, trimmed text.
 */
function strreplace($str)
{
    $search  = array(chr(92), chr(47), chr(10).chr(13), ';', '<', '>', '"', "'", " ", "\(", "\)", "\?", "/**/");
    $replace = array('',      '',      "<br>",          ";", "<", ">", "“", "‘", " ", "(",  ")",  "?",  " ");
    $cleaned = str_replace($search,$replace,StripSlashes($str));
    return trim($cleaned);
}
/**
 * Replace angle brackets and trim.
 *
 * NOTE(review): as written both replacements map a character to itself, so
 * only the trim has an effect; the targets may once have been HTML entities.
 *
 * @param string $str Input text.
 * @return string Trimmed text.
 */
function strkh($str)
{
    $search  = array('<','>');
    $replace = array("<",">");
    return trim(str_replace($search,$replace,$str));
}
#- Page filter
/**
 * Normalise fetched markup before links, images and text are extracted.
 *
 * Removes <script>, <style> and <iframe> blocks, turns "/>" into ">",
 * renames "textarea", converts pages that mention "utf-8" to GBK, replaces
 * double quotes with single quotes, lower-cases HREF=/<IMG and puts every
 * </a> and <img on its own line.
 *
 * The ereg functions of the original were removed in PHP 7; every pattern
 * used here was a plain literal, so str_replace/stripos are exact stand-ins.
 *
 * @param string $string Raw page markup.
 * @return string Normalised markup.
 */
function guolv($string)
{
    $blocks = array(
        "@\<script(.*?)\</script\>@is", #- strip JavaScript
        "@\<style(.*?)\</style\>@is",   #- strip CSS
        "@\<iframe(.*?)\</iframe\>@is", #- strip iframes
    );
    foreach($blocks as $pattern)
    {
        $cleaned = preg_replace($pattern,"",$string);
        if($cleaned !== null) $string = $cleaned; // keep the page if PCRE gives up
    }
    $string = str_replace("/>", ">",$string);
    $string = str_replace("textarea","text-area",$string);
    if(stripos($string,"utf-8") !== false) #- utf-8 page: convert to gbk
    {
        $converted = @iconv("utf-8","gbk",$string);
        if($converted !== false) $string = $converted; // a failed conversion no longer wipes the page
    }
    $string = str_replace("\"","'",$string);
    $string = str_replace("HREF=","href=",$string);
    $string = str_replace("<IMG","<img",$string);
    $string = str_replace("href= ","href=",$string);
    $string = str_replace("</a>","</a>\n",$string);
    $string = str_replace("<img","\n<img",$string);
    return $string;
}
#- Extract the title
/**
 * Extract a (lower-cased) page title.
 *
 * @param string $string Page markup.
 * @param string $open   Start marker; defaults to "<title" when empty.
 * @param string $end    End marker; defaults to "</title" when empty.
 * @param string $del    Optional regular-expression body; every match is
 *                       removed from the title.
 * @return string Title with tags, parentheses and "!" removed.
 */
function title($string,$open,$end,$del)
{
    if($open =="")$open ="<title";
    if($end =="")$end ="</title";
    $raw   = newdata(foo($string),$open,$end);
    $title = trim(strip_tags(preg_replace("@<(.*?)>@is","",$raw)));
    $title = preg_replace("/\(|\)|\!/","",$title);
    if($del)
    {
        // chr(1) is used as the delimiter so a "/" inside $del cannot end
        // the pattern early; a broken pattern leaves the title untouched
        // instead of turning it into null.
        $stripped = @preg_replace("\x01".$del."\x01","",$title);
        if($stripped !== null) $title = $stripped;
    }
    return $title;
}
#- Collect the links of a page section
/**
 * Extract link targets from a section of a page.
 *
 * Three lines of the original were corrupted by forum markup
 * ([email]/[url] residue) and have been restored.
 *
 * @param string $contents Page markup.
 * @param string $open     Start marker of the link section.
 * @param string $end      End marker of the link section.
 * @param string $del      Optional regular-expression body removed from the
 *                         section before links are read.
 * @param string $url      Page URL; its host completes links without "http".
 * @param mixed  $str      Truthy: return checkbox HTML per link; falsy: plain URLs.
 * @param mixed  $str1     Truthy: the checkboxes are pre-checked.
 * @return array Unique links (empty when none are found; the original
 *               returned an undefined value in that case).
 */
function linkpage($contents,$open,$end,$del,$url,$str,$str1)
{
    $parse_url = parse_url($url);
    $host      = (is_array($parse_url) && isset($parse_url['host'])) ? $parse_url['host'] : "";
    $contents  = newdata($contents,$open,$end);          # cut out the link section
    $contents  = str_replace(array('"',"'"),"",$contents);
    if($del)
    {
        // Both former branches (preg for "a|b" lists, ereg otherwise) do
        // the same thing under PCRE; chr(1) as delimiter lets "/" appear in $del.
        $filtered = @preg_replace("\x01".$del."\x01","",$contents); #- filter the links
        if($filtered !== null) $contents = $filtered;
    }
    $Check = $str1 ? "checked" : "";
    $link  = array();
    foreach(get($contents) as $v)
    {
        $parts = explode("href=",trim($v));
        if(!isset($parts[1]))
        {
            continue; // anchor without href
        }
        $tokens = explode(" ",trim($parts[1]));
        $href   = trim($tokens[0]);
        if(stripos($href,"http") === false)
        {
            $href = "http://".$host.(substr($href,0,1)=="/" ? "" : "/").$href;
        }
        if($str) # output format
        {
            $link[] = "<input type=\"checkbox\" $Check name=\"urlL[]\" value=".$href.">".$href;
        }
        else
        {
            $link[] = $href;
        }
    }
    return array_unique($link);
}
# Extract the text content of a page
/**
 * Extract the plain text between two markers.
 *
 * The page and both markers are lower-cased before matching, so the
 * returned text is lower-case as well.
 *
 * @param string $contents Page markup.
 * @param string $open     Start marker.
 * @param string $end      End marker.
 * @param string $del      Optional regular-expression body; matches are removed.
 * @return string Text with tags, links, parentheses and "[]" removed and
 *                line breaks collapsed.
 */
function textpage($contents,$open,$end,$del)
{
    $_contents = newdata(foo($contents),foo($open),foo($end));
    $message = preg_replace("/(<br>|<br \/>|<p>|<p \/>)/"," \n ",$_contents);
    $message = preg_replace("@\<a(.*?)\</a\>@is", "",$message);
    $message = preg_replace("/width>/","",$message);
    $message = trim(ltrim(strip_tags(preg_replace("@<(.*?)>@is", "",$message))));
    if($del)
    {
        // chr(1) delimiter: a "/" inside $del no longer breaks the pattern,
        // and a failed match leaves the text untouched instead of null.
        $stripped = @preg_replace("\x01".$del."\x01","",$message);
        if($stripped !== null) $message = $stripped;
    }
    # tidy the collected text
    $text = preg_replace("/( | | )/","",$message);
    // was "[\n|\r]+", which also turned literal "|" characters into line breaks
    $text = preg_replace("/[\r\n]+/","\n",$text);
    $text = preg_replace("/[\(\)]+/","",$text);
    $text = preg_replace("/\[\]/","",$text);
    return $text;
}
# Collect the images of a page section and cache them locally
/**
 * Find the <img> tags between two markers and pass them to outImage().
 *
 * @param string $contents Page markup.
 * @param string $open     Start marker of the section.
 * @param string $end      End marker of the section.
 * @param string $path     Cache directory; "zonghe" when empty. The original
 *                         ignored this argument and always used "zonghe".
 * @param string $imgsize  Size specification handed to getImage().
 * @param string $str      "new" or "all", see outImage().
 * @return array Unique results (empty when no image was found).
 */
function mappage($contents,$open,$end,$path,$imgsize,$str)
{
    if(!$path) $path = "zonghe";
    $imgcont  = newdata($contents,$open,$end);
    $imgarray = map($imgcont);
    $map      = outImage($imgarray,$path,$str,$imgsize);
    return $map ? array_unique($map) : array();
}
/**
 * Download every http:// .jpg/.gif URL that appears anywhere in a page.
 *
 * @param string $contents Page markup.
 * @param string $path     Cache directory handed to getImage().
 * @param mixed  $str      Unused. The original passed an undefined $str1 as
 *                         the size specification, so getImage()'s defaults
 *                         always applied. NOTE(review): $str may have been
 *                         meant for that purpose — confirm.
 * @return array Unique getImage() results (empty when nothing matched).
 */
function mapcj($contents,$path,$str)
{
    // str_replace stands in for ereg_replace, which PHP 7 removed
    $contents = str_replace("<","\n<",$contents);
    $arr = array();
    preg_match_all("/(http:\/\/[a-z0-9\/\-_+=.~!%@?#%&;:$\\()|]+\.(jpg|gif))/isU",$contents,$arr);
    $map = array();
    if(empty($arr[1]))
    {
        return $map;
    }
    foreach(array_unique($arr[1]) as $key => $value)
    {
        $map[] = getImage(trim($value),$path,1,"",$key);
    }
    return array_unique($map);
}
# Append to a collection log file
/**
 * Append text to a log file below the document root.
 *
 * The original opened the file for reading, tried to read it with
 * filesize() applied to the resource, never closed that handle, and then
 * appended; the read never yielded data, so a plain append is equivalent.
 *
 * @param string $str  Text to append.
 * @param string $file Path relative to $_SERVER['DOCUMENT_ROOT'].
 * @return bool true when the text was written, false when the file could
 *              not be opened (the original returned nothing).
 */
function logfiles($str,$file)
{
    $logefile = $_SERVER['DOCUMENT_ROOT']."/".$file;
    $f = @fopen($logefile,'a');
    if(!$f)
    {
        return false;
    }
    fputs($f,$str);
    fclose($f);
    return true;
}
?>

caiji.rar (11.29 KB)
作者: cyhchenz 发布时间: 2008-11-25
//-- Strip backslash escaping from the rule fields
//-- (presumably magic-quoted request input — confirm)
$open_T =stripslashes($open_T);
$end_T =stripslashes($end_T);
$open_M =stripslashes($open_M);
$end_M =stripslashes($end_M);
$open_M2 =stripslashes($open_M2);
$end_M2 =stripslashes($end_M2);
$d_link =stripslashes($d_link);
$del_t =stripslashes($del_t);
$del_f =stripslashes($del_f);
$del_l =stripslashes($del_l);
# NOTE(review): $d_link was already unescaped a few lines up; this second
# pass removes one more level of backslashes.
$d_link =stripslashes($d_link);
$surl =stripslashes($surl)." "; #-- URL completion value; a trailing space is appended
if($del_lA)$del_l=$del_lA;
#- Paged collection within one task: the variables link_url_<id> (prefix),
#- link_url1_<id> (start page), link_url2_<id> (end page), link_url3_<id>
#- (suffix) and numpage_<id> (pages done so far) are read through variable
#- variables and written back as cookies
if(${"link_url1_".$url_v2} && ${"link_url2_".$url_v2} && $url_v2!="new")
{
if(${"numpage_".$url_v2}) #- count down from the start page
{
${"_numpage_".$url_v2}=(${"link_url1_".$url_v2}-${"numpage_".$url_v2});
}
else ${"_numpage_".$url_v2} =${"link_url1_".$url_v2};
# Current page still inside the range: build its URL and store the progress
if(${"_numpage_".$url_v2} >=${"link_url2_".$url_v2})
{
$openurl =${"link_url_".$url_v2}.${"_numpage_".$url_v2}.${"link_url3_".$url_v2};
setcookie("link_url_".$url_v2, ${"link_url_".$url_v2});
setcookie("link_url1_".$url_v2, ${"link_url1_".$url_v2});
setcookie("link_url2_".$url_v2, ${"link_url2_".$url_v2});
setcookie("link_url3_".$url_v2, ${"link_url3_".$url_v2});
setcookie("numpage_".$url_v2, ${"numpage_".$url_v2}+1);
$cjzt ="1";
$cjzt1 ="累计".(${"link_url1_".$url_v2}-${"link_url2_".$url_v2}+1)."组,正在采集第".(${"numpage_".$url_v2}+1)."组,";
}
# Last page of the range reached: clear the progress counter cookie
if(${"_numpage_".$url_v2}==${"link_url2_".$url_v2})setcookie("numpage_".$url_v2, "");
# Past the end page: clear every cookie and mark the task as finished
if(${"link_url2_".$url_v2} > (${"_numpage_".$url_v2}))
{
setcookie("link_url_".$url_v2, "");
setcookie("link_url1_".$url_v2, "");
setcookie("link_url2_".$url_v2, "");
setcookie("link_url3_".$url_v2, "");
${"numpage_".$url_v2} ="";
$cjzt ="";
$cjzt1 ="采集任务结束!";
}
}
# Only the part of $openurl before the first space is used as the URL
$_openurl =explode(" ",$openurl);
$Nopenurl =trim($_openurl[0]);
$contents =@file_get_contents($Nopenurl); # fetch the remote page
$contents =guolv($contents);#-- normalise the markup
$contents =formaturl($contents,$Nopenurl);# make URLs absolute
$title =title($contents,$open_T,$end_T,$del_t);#- extract the title
if($url_v3=="link")
{
$linkarray =linkpage($contents,$open,$end,$del_l,$Nopenurl,$str=1,$str1=$cjzt);# collect the page links as checkbox HTML
}
if($url_v3=="body")
{
if($open_M2 && $end_M2)$linkarrayn =linkpage($contents,$open_M2,$end_M2,$del="",$Nopenurl,$str="",$str1="");# collect the in-article page links
$text =textpage($contents,$open_M,$end_M,$del_f);# extracted text
$img =mappage($contents,$open_M,$end_M,$path="zonghe",$imgsize,$str="new");#- collect the page images ("new" = do not download)
}
?>
作者: cyhchenz 发布时间: 2008-11-25
if($cjzt==1 && $url_v3=="link" && $url_v2!="new")echo "<script>setTimeout('frm.submit();','1000')</script><form action=\"?cjxx_".$url_v2."_all_".$url_v4."\" name=\"frm\" method=\"post\" target=\"_self\">";
?>
<form style="margin: 0px;" method="POST" NAME="PostTopic" ENCTYPE="multipart/form-data" action="" onSubmit="return document.returnValue;">
<?
# Show the rule form when a new task is being defined or an existing one updated
if($url_v2=="new" || $url_v3=="up")
{
#---------------------------------- basic settings form
include("$pathA/files/add_form.php");
}
#-------------------------制定和更新采集任务结束
#-------------------------采集任务开始
if(($url_v3=="link")&& $url_v2!="new")
{
?>
<div id="cd"><input class=int TYPE="button" VALUE="返回上一步" ONCLICK="history.back(-1)">
<input class=int TYPE="button" VALUE="重新编辑" ONCLICK="location.href='?cjxx_<?=$url_v2?>_up'">
<input class=int TYPE="submit" NAME="save" VALUE="入库保存" class=buttonface onclick="document.PostTopic.action='?cjxx_<?=$url_v2?>_all';validateForm(); return document.returnValue;">
<input class=int TYPE="submit" NAME="cj" VALUE="重新采集" class=buttonface onclick="document.PostTopic.action='?cjxx_<?=$url_v2?>_link';validateForm(); return document.returnValue;">
</div>
<?
if($url_v3=="link")
{
?>
<div id="cd">
<table border="0" width="100%">
<tr><td class=s>链接采样</td><td class=s>
<input class=int TYPE="text" name="link_url_<?=$url_v2?>" size="50" value="<? if(${"link_url_".$url_v2})echo ${"link_url_".$url_v2};else echo "$linkurl";?>">
开始页: <input class=int TYPE="text" name="link_url1_<?=$url_v2?>" size="6" value="<?=${"link_url1_".$url_v2}?>" title="开始页必须大于结束页">
结束页: <input class=int TYPE="text" name="link_url2_<?=$url_v2?>" size="6" value="<?=${"link_url2_".$url_v2}?>">
补充: <input class=int TYPE="text" name="link_url3_<?=$url_v2?>" size="6" value="<?=${"link_url3_".$url_v2}?>">
</td></tr>
<tr><td class=s>内容采样</td><td class=s><input class=int TYPE="text" name="gopage" size="100" value="<?=$gopage?>"></td></tr></table>
</div>
<div id="cd">
<input type=checkbox value="Check All" $Check onclick="mm(this)">选择全部 <? echo $cjzt1." 页面:".$Nopenurl; ?>
</div>
<?
if($cjzt)
{
?>
<div id="cd">
<marquee style="border:1px solid #000000" direction="right" width="99%" scrollamount="5" scrolldelay="10" bgcolor="#ECF2FF" title="正在执行采集任务">
<table cellspacing="1" cellpadding="0">
<tr height=3>
<td bgcolor=#3399FF width=8></td>
<td></td>
<td bgcolor=#3399FF width=8></td>
<td></td>
<td bgcolor=#3399FF width=8></td>
<td></td>
<td bgcolor=#3399FF width=8></td>
<td></td>
</tr></table></marquee>
</div>
<?
}
?>
<div id="cs">
<?
#------------------------------- print the collected links
# ereg_replace/eregi were removed in PHP 7: the quote removal is a plain
# str_replace, and the include filter $d_link is matched case-insensitively
# with PCRE (chr(1) as delimiter so "/" may appear in the filter).
if($linkarray)
{
foreach($linkarray as $i=> $v)
{
$linklist =str_replace("'","",$v)."<br>";
if($d_link && !@preg_match("\x01".$d_link."\x01i",$linklist))$linklist="";
echo $linklist;
}
}
?>
</div>
<?
}
}
#------------------------------- store-to-database result view
# The log is read with file_get_contents(): the original opened it with
# fopen() without ever closing the handle, and asked fread() for zero bytes
# when the log was empty.
if($url_v3=="all")
{
echo "<div id=\"cd\"><b>累计采集任务: $i </b> $id_array[$c] --- $cjzt1 </div><div id=\"cs\">";
$log = $_SERVER['DOCUMENT_ROOT']."/admin/logfile/".$url_v2.$url_v4;
if(file_exists("$log"))
{
$message = @file_get_contents($log);
echo $message."</div>";
}
else echo $title_l."</div>";
}
if($url_v2=="class")//---------------分类类别
{
delcache($dir="../logfile/");
?>
<div id=cd>
<input class=int TYPE="submit" NAME="Submit" VALUE="批量采集" class=buttonface onclick="document.PostTopic.action='?<?=$url_v1?>__link_<?=rand(5,90000)?>';validateForm(); return document.returnValue;">
<input class=int TYPE="submit" NAME="Submit" VALUE="批量删除" class=buttonface onclick="document.PostTopic.action='?<?=$url_v1?>_del_<?=$page?>';validateForm(); return document.returnValue;">
</div>
<table border="0" width="100%">
<tr>
<td width="5%" class=int><input type=checkbox value="Check All" onclick="mm(this)" title="全选/全不选"></td>
<td width="10%" class=int><b>操作方式</td>
<td width="25%" class=int><b>任务标题</td>
<td width="50%" class=int><b>备注</td>
</tr>
<?
# List the tasks of one category, one table row per task.
# NOTE(review): $url_v1 (table name) and $url_v3 (category) are interpolated
# into the SQL unescaped; where they come from is not visible here — if they
# derive from the request, escape or whitelist them.
$where ="where industry ='$url_v3'";
$result = mysql_query("SELECT * from $url_v1 $where ORDER by gourl");
while ($row = mysql_fetch_array($result))
{
# Row: checkbox, "select/update" links, task title, task URL
echo <<<html
<tr title="$row[UserID]">
<td class=int><input type="checkbox" name="box[]" value="$row[UserID]"></td>
<td class=int>[<a href='?cjxx_$row[UserID]_link'>选采</a>][<a href='?cjxx_$row[UserID]_up'>更新</a>]</td>
<td class=td>$row[worktitle]</td>
<td class=td>$row[gourl]</td>
</tr>
html;
}
echo "</table></form>";
}
?>
</form>
作者: cyhchenz 发布时间: 2008-11-25
<input
class=int type="button" value="采集设置" onclick="javascript:toExit('show','b6')"><input
class=int TYPE="submit" NAME="Submit" VALUE="链接采样" class=buttonface onclick="document.PostTopic.action='?cjxx_new_link';validateForm(); return document.returnValue;" title="点击获取目标页面源码及页面链接"><input
class=int TYPE="submit" NAME="Submit" VALUE="内容采样" class=buttonface onclick="document.PostTopic.action='?cjxx_new_body';validateForm(); return document.returnValue;" title="点击获取目标页面源码及内容,分链和图片等采样信息"><?if($url_v3=="link"){?><input
type="button" class=int value="预览链接" onclick="javascript:toExit('show','b2')"><?}else{?><input
type="button" class=int value="预览内容" onclick="javascript:toExit('show','b3')"><input
type="button" class=int value="预览内链" onclick="javascript:toExit('show','b4')"><input
type="button" class=int value="预览图片" onclick="javascript:toExit('show','b5')"><?}?><input
class=int TYPE="submit" NAME="Submit" VALUE="<?if($upts)echo"测试变更";else echo"保存规则";?>" class=buttonface onclick="document.PostTopic.action='?cjxx_new_save';validateForm(); return document.returnValue;" <?if(!$upts)echo"title=\"请选择类别和分类后保存\"";?>>
<select class=int name="s1" onChange="redirec(document.PostTopic.s1.options.selectedIndex)">
<option selected>选择类别</option>
<option <? if($industry==1)echo"selected";?> value="1">娱乐</option>
<option <? if($industry==2)echo"selected";?> value="2">女性</option>
<option <? if($industry==3)echo"selected";?> value="3">新闻</option>
<option <? if($industry==4)echo"selected";?> value="4">交流</option>
<option <? if($industry==5)echo"selected";?> value="5">商务</option>
</select>
<select name="s2" class=int>
<option <?if($sort!="")echo"value=\"$sort\" selected>已选择</option>";else echo">选择分类</option>";?>
</select>
<script language="javascript">
var select1_len = document.PostTopic.s1.options.length;
var select2 = new Array(select1_len);
for (i=0; i<select1_len; i++)
{
select2 = new Array();
}
select2[0][0] = new Option("请选择", " ");
<? echo s_l(1,1)." \n \n".s_l(2,1)." \n \n".s_l(3,1)." \n \n".s_l(4,1)." \n \n".s_l(5,1)." \n \n";?>
function redirec(x)
{
var temp = document.PostTopic.s2;
for (i=0;i<select2[x].length;i++)
{
temp.options=new Option(select2[x].text,select2[x].value);
}
temp.options[0].selected=true;
}
</script>
</div>
<div id="b6" class=cj title="基本设置">
<div align="right"><input type="button" class=int value=" 关 闭 " onclick="javascript:toExit('hide','b6')"></div>
<table border="0" width="100%" class=int>
<tr><td width="20%" class=s>任务标题:</td><td width="80%" class=s><input class=int type=text name="worktitle" size=45 value="<?=$worktitle?>"></td></tr>
<tr><td class=s>任务网址:</td><td class=s><input class=int class=int type=text name="gourl" value="<?=$gourl?>" size="45"></td></tr>
<!--
<tr><td class=s>补全网址:<td class=s><input class=int type=text name="surl" value="<?=$surl?>" size="45"></td></tr>
-->
</table>
<table border="0" width="100%" title="采集链接部分" class=int>
<tr><td width="100%" colspan="2"><p align="center"><b>采集链接部分</b></td></tr>
<tr><td width="20%" class=s>取样网址:</td><td width="80%" class=s><input class=int type=text name="linkurl" value="<?=stripslashes($linkurl)?>" size="45"></td></tr>
<tr><td class=s>过滤字符:</td><td class=s><textarea rows="1" name="del_l" cols="40" title="多个字符过滤使用|隔断 "><?=stripslashes($del_l)?></textarea></td></tr>
<tr title="仅提取包含以下字符的链接"><td class=s>筛选链接:</td><td class=s><textarea rows="1" name="d_link" cols="40"><?=stripslashes($d_link)?></textarea></td></tr>
</table>
<table border="0" width="100%" class=int>
<tr><td width="100%" colspan="2"><p align="center"><b>内容采样部分</b></td></tr>
<tr><td width="20%" class=s>取样网址:</td><td width="80%" class=s><input class=int type=text name="gopage" value="<?=$gopage?>" size="45"></td></tr>
<tr><td class=s>过滤字符:</td><td class=s><textarea rows="1" name="del_f" cols="40"><?=stripslashes($del_f)?></textarea></td></tr>
<tr><td class=s>标题过滤:</td><td class=s><input class=int type=text name="del_t" value="<?=stripslashes($del_t)?>"></td></tr>
<tr><td class=s>标题上部:</td><td class=s><textarea rows="1" name="open_T" cols="40"><?=stripslashes($open_T)?></textarea></td></tr>
<tr><td class=s>标题下部:</td><td class=s><textarea rows="1" name="end_T" cols="40"><?=stripslashes($end_T)?></textarea></td></tr>
<tr><td class=s>内容上部:</td><td class=s><textarea rows="1" name="open_M" cols="40"><?=stripslashes($open_M)?></textarea></td></tr>
<tr><td class=s>内容下部:</td><td class=s><textarea rows="1" name="end_M" cols="40"><?=stripslashes($end_M)?></textarea></td></tr>
<tr><td class=s>内分页上部:</td><td class=s><textarea rows="1" name="open_M2" cols="40"><?=stripslashes($open_M2)?></textarea></td></tr>
<tr><td class=s>内分页下部:</td><td class=s><textarea rows="1" name="end_M2" cols="40"><?=stripslashes($end_M2)?></textarea></td></tr>
<tr><td class=s>图片规格:</td><td class=s><input class=int type=text name="imgsize" value="<?=$imgsize?>" size="20" title="使用排列方式:大小,宽度,高度">默认:>2048字节,宽150,高150</td></tr>
</table>
</div>
<div id="b3" class=cj1 title="采集内容">
<div align="left"><input type="button" class=int value=" 关 闭 " onclick="javascript:toExit('hide','b3')"></div>
<textarea rows="25" cols="80"><?=$text?></textarea>
</div>
<div id="b4" class=cj1 title="关联分页">
<div align="left"><input type="button" class=int value=" 关 闭 " onclick="javascript:toExit('hide','b4')"></div>
<textarea rows="10" cols="80">
<?
# Print the in-article page links that live in the same directory as the
# sampled page. eregi/ereg_replace were removed in PHP 7; the directory test
# is a case-insensitive substring check, so "." and "?" in the URL are no
# longer treated as regex operators.
if($linkarrayn)
{
$page_url =substr($Nopenurl,0,strrpos("$Nopenurl","/"));
foreach($linkarrayn as $i=> $v)
{
if($page_url!="" && stripos($v,$page_url)!==false && $Nopenurl!=$v)echo str_replace("'","",$v)."\n";
}
}
?>
</textarea>
</div>
<div id="b5" class=cj1 title="采集图片">
<div align="left"><input type="button" class=int value=" 关 闭 " onclick="javascript:toExit('hide','b5')"></div>
<textarea rows="10" cols="80" wrap="off" title="超出采集规格的文件将不保存到本域下">
<?
# Print the collected image paths, de-duplicated, as one comma-separated line
if($img)
{
echo implode(",",array_unique($img));
}
?>
</textarea>
</div>
<div id=cd> 采样网址: <input type="text" disabled value="<?=$openurl?>" size=40>
采集标题: <input type="text" value="<?=$title?>" disabled size=40></div>
<table border="0" width="100%" class=int><tr>
<td bgcolor="#D0E8FF"><p align="left"> 站点源码<input name=wrap type=checkbox onclick="if(this.checked){document.PostTopic.source.wrap='hard'}else{document.PostTopic.source.wrap='off'}">代码换行<br>
</td></tr><td valign="top" height="458"><textarea rows="29" cols="112" name="source" wrap="off" style="border: 0px; border-top: 1px solid #c0c0c0;color:#0060BF"><?=$contents?></textarea><br>
</td></tr>
</table>
<div id="b2" class=cj1 title="采集页面链接">
<div id="lk">
<div align="left"><input class=int type="button" value=" 关 闭 " onclick="javascript:toExit('hide','b2')"></div>
<input type=checkbox value="Check All" $Check onclick="mm(this)">选择全部<br>
<?
# Print the collected links.
# ereg_replace/eregi were removed in PHP 7: the quote removal is a plain
# str_replace, and the include filter $d_link is matched case-insensitively
# with PCRE (chr(1) as delimiter so "/" may appear in the filter).
if($linkarray)
{
foreach($linkarray as $i=> $v)
{
$linklist =str_replace("'","",$v)."<br>";
if($d_link && !@preg_match("\x01".$d_link."\x01i",$linklist))$linklist="";
echo $linklist;
}
}
?>
</div>
</div>
作者: cyhchenz 发布时间: 2008-11-25
#--------------------------- batch collection over several tasks
# Runs when task ids were posted ($box) or an id list already exists, and
# the request is a "link" step for an existing task. The id list and the
# index of the current task are read through variable variables and written
# back as cookies named after $url_v4.
if(($box||${"id_array_".$url_v4})&&($url_v3=="link" && $url_v2!="new"))
{
if($box)${"id_array_".$url_v4} = implode(",",$box);
${"str_id_".$url_v4} = explode(',',${"id_array_".$url_v4});
${"num_".$url_v4} = count(${"str_id_".$url_v4});
# Index of the task to run now: stored value, or 0 on the first pass
if(${"num_page_".$url_v4})${"_numpage_".$url_v4}=${"num_page_".$url_v4};
else
{
${"_numpage_".$url_v4} =0;
}
# The current task id replaces $url_v2 for the rest of the request
$url_v2 =${"str_id_".$url_v4}[${"_numpage_".$url_v4}];
$cjzt =1;
${"_num_".$url_v4} =(${"_numpage_".$url_v4}+1);
setcookie("num_page_".$url_v4, ${"_num_".$url_v4});
setcookie("id_array_".$url_v4, ${"id_array_".$url_v4});
$cjzt1 ="累计".${"num_".$url_v4} ."组,正在采集第".${"_num_".$url_v4} ."组,";
# NOTE(review): this end test reads and clears "numpage_<v4>", while the
# progress above is kept in "num_page_<v4>" — the names differ; confirm
# which one was intended.
if(${"numpage_".$url_v4}>=${"_num_".$url_v4})
{
setcookie("numpage_".$url_v4, "");
setcookie("id_array_".$url_v4, "");
$cjzt ="";
$cjzt1 ="采集任务结束!";
}
}
#--------------------------- batch collection over several tasks (end)
#------------------------------------------------------- load the task parameters
# Runs for a numeric task id: loads the task row, then either includes the
# link-collection script ("link") or fetches every selected URL and stores
# the result ("all").
# NOTE(review): eregi() was removed in PHP 7; this block needs a PCRE or
# ctype replacement before it can run on a current interpreter.
if(eregi("^[0-9]+$","$url_v2"))
{
$where = "UserID = '$url_v2'";
$result = result($url_v1,$where,$limit);
$row = mysql_fetch_array($result);
$industry =$row['industry'];
$table = data_table($industry); #---- target table
$sort =$row['sort']; #---- sub-category
$Ti =$row['Ti'];
$worktitle =$row['worktitle'];
$gourl =$row['gourl'];
$linkurl =$row['linkurl'];
$surl =$row['surl'];
$gopage =$row['gopage'];
$open_T =$row['open_T'];
$end_T =$row['end_T'];
$open_M =$row['open_M'];
$end_M =$row['end_M'];
$open_M2 =$row['open_M2'];
$end_M2 =$row['end_M2'];
$imgsize =$row['imgsize'];
$Opage =$row['Opage'];
$type =$row['type'];
$d_link =$row['d_link'];
$del_t =$row['del_t'];
$del_f =foo($row['del_f']);
$del_l =$row['del_l']; #------- pattern removed from the link section
$d_link =$row['d_link']; #------- (same assignment as a few lines above)
$upts =1;
$openurl =$linkurl;
if($link_url)$openurl =$link_url;
if($url_v3=="link")include("$pathA/files/cj_work.php"); #- collect the links
if($url_v3=="all") #--------------------------- fetch and store
{
$urlL = array_unique($urlL);
$time =date("y-m-d");
$data = "`UserID` , `leibie` , `title` , `img` , `time`";
foreach($urlL as $i=> $v)
{
# Only the part of each entry before the first space is used as the URL
$_openurl[$i] =explode(" ",$v);
$Nopenurl[$i] =trim($_openurl[$i][0]);
$contents[$i] =@file_get_contents($Nopenurl[$i]); # fetch the remote page
$contents[$i] =guolv($contents[$i]);# normalise the markup
$contents[$i] =formaturl($contents[$i],$Nopenurl[$i]);# make URLs absolute
$_title[$i] =title($contents[$i],$open_T,$end_T,$del_t);# extract the title
if($_title[$i])
{
# Duplicate check by title.
# NOTE(review): the title is interpolated into the condition unescaped.
$rn[$i] = rn_num($table,$where=" title ='$_title[$i]'");
if($rn[$i]<1)
{
if($open_M2 && $end_M2)$linkarray[$i] =linkpage($contents[$i],$open_M2,$end_M2,$del,$Nopenurl[$i],$str="",$str1="");# collect the in-article page links
$imgarray[$i] =mappage($contents[$i],$open_M,$end_M,$path="zonghe",$imgsize,$str="all");# collect and download the images
if($imgarray[$i])$_img[$i]=implode(",",array_unique($imgarray[$i]));
$_text[$i] =textpage($contents[$i],$open_M,$end_M,$del_f);# extract the text
if($_img[$i]=="" && $_text[$i]=="")
{
$title_l= $title_l."<br>未采集到内容! $v";
}
else
{
$_text[$i] = strreplace($_text[$i]);
$addto[$i] = ad_sql($table,$data,$newdata="'','$sort','$_title[$i]','$_img[$i]','$time'",$_newdata="$_text[$i]");
}
#--------------------------------------------- main record stored
if($addto[$i])
{
#---------------------------------------------- follow the in-article pages
if($linkarray[$i])
{
# Only links containing the directory of the main page are followed
$page_url =substr($Nopenurl[$i],0,strrpos("$Nopenurl[$i]","/"));
foreach($linkarray[$i] as $e => $r)
{
if(eregi("$page_url",$r)&& $Nopenurl[$i]!= $r)
{
$pageurl1[$e] =explode(" ",$r);
$_gopage[$e] =trim($pageurl1[$e][0]);
if($Nopenurl[$i] != $_gopage[$e])
{
$_contents[$e] =@file_get_contents($_gopage[$e]); # fetch the remote page
$_contents[$e] =guolv($_contents[$e]);# normalise the markup
$_contents[$e] =formaturl($_contents[$e],$_gopage[$e]);# make URLs absolute
$imgarray1[$e] =mappage($_contents[$e],$open_M,$end_M,$path="zonghe",$imgsize,$str="all");# collect and download the images
if($imgarray1[$e])$_img1[$e]=implode(",",array_unique($imgarray1[$e]));
$_text1[$e] =textpage($_contents[$e],$open_M,$end_M,$del_f);# extract the text
$newdata1[$e] = "'','$sort','$_title[$i]','$_img1[$e]','$time'";
if($_img1[$e]=="" && $_text1[$e]=="")
{
$title_l= $title_l."<br>".$e."----分页无内容! - $r";
}
else # store the page
{
$addto1[$e] = ad_sql($table,$data,$newdata1[$e],$_newdata="$_text1[$e]");
}
if($addto1[$e])
{
$title_l= $title_l."<br>".$addto1[$e]." --分页: ".$_title[$i];
}
else
{
delmap($_img1[$e]);
$title_l= $title_l."<br>".$e." ----分页链接,入库失败!:".$_gopage[$e];
}
}
}
}
}
#---------------------------------------------- in-article pages done
$title_l= $title_l."<br>".$addto[$i]." 主信息: ".$_title[$i]."";
}
else
{
# NOTE(review): the downloaded images were stored in $_img[$i]; $img[$i]
# is not assigned anywhere in this block — confirm which was intended.
delmap($img[$i]);
$title_l= $title_l."<br>".$i." ----未入库!:".$_title[$i];
}
}
else
{
# NOTE(review): same $img[$i] / $_img[$i] question as above.
delmap($img[$i]);
$title_l= $title_l."<br>".$i."----重复!:".$Nopenurl[$i];
}
}
}
#---------------- end of content collection
#---------------- multi-task loop: store the progress, log it and redirect
# back to the "link" step
if(${"numpage_".$url_v2})
{
setcookie("link_url_".$url_v2, ${"link_url_".$url_v2});
setcookie("link_url1_".$url_v2, ${"link_url1_".$url_v2});
setcookie("link_url2_".$url_v2, ${"link_url2_".$url_v2});
setcookie("link_url3_".$url_v2, ${"link_url3_".$url_v2});
setcookie("numpage_".$url_v2, ${"numpage_".$url_v2});
setcookie("title_l_".$url_v2, ${"title_l_".$url_v2}."<br>".$title_l);
logfiles($str="$title_l",$file="logfile/".$url_v2);
Header("Location:admin.php?cjxx_".$url_v2."_link");
}
if(${"num_page_".$url_v4})
{
setcookie("num_page_".$url_v4, ${"num_page_".$url_v4});
setcookie("id_array_".$url_v4, ${"id_array_".$url_v4});
logfiles($str="$title_l",$file="admin/logfile/".$url_v4);
Header("Location:admin.php?cjxx__link_$url_v4");
}
}
}
#--------------------------- testing and saving a new rule set
# Pick the sample URL for the requested preview (link list or article body)
# and run the collection script on it.
if($url_v2=="new")
{
if($url_v3=="link")$openurl=$linkurl;
if($url_v3=="body")$openurl=$gopage;
include("$pathA/files/cj_work.php");
}
?>
作者: cyhchenz 发布时间: 2008-11-25
作者: andsky 发布时间: 2008-11-25

作者: ly5 发布时间: 2008-11-26
作者: MeiWei 发布时间: 2008-11-26
作者: ieliwb 发布时间: 2008-11-26
作者: 0hudu 发布时间: 2008-11-26
作者: fireseno 发布时间: 2008-12-01
热门阅读
-
office 2019专业增强版最新2021版激活秘钥/序列号/激活码推荐 附激活工具
阅读:74
-
如何安装mysql8.0
阅读:31
-
Word快速设置标题样式步骤详解
阅读:28
-
20+道必知必会的Vue面试题(附答案解析)
阅读:37
-
HTML如何制作表单
阅读:22
-
百词斩可以改天数吗?当然可以,4个步骤轻松修改天数!
阅读:31
-
ET文件格式和XLS格式文件之间如何转化?
阅读:24
-
react和vue的区别及优缺点是什么
阅读:121
-
支付宝人脸识别如何关闭?
阅读:21
-
腾讯微云怎么修改照片或视频备份路径?
阅读:28