首页 | 新闻 | 交流 | 问吧 | 文档 | 手册 | 下载 | 博客

收藏此问题 发表新评论

采集前程无忧数据

本来是做 .NET 的,因为工作需要做个统计分析,就写了个简单的采集程序,一共采集了 210 份数据,直接导出为 Excel。速度有点慢,代码还没有优化,以后有时间再改。
<?php
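/**
 * job.php
 * Scrapes job postings from 51job (前程无忧) and emits them as an HTML
 * table that is served to the browser as an Excel (.xls) download.
 */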
// Turn on output buffering and lift the execution time limit: the script
// performs one HTTP request per posting id, which can take a long time.
ob_start();
set_time_limit(0);

// Error levels are bit flags and must be combined with the bitwise "|".
// The previous "E_ERROR || E_PARSE" was a logical OR that evaluated to
// true (1), i.e. E_ERROR only.
error_reporting(E_ERROR | E_PARSE);

// Serve the HTML table below as a downloadable Excel file. The
// Content-Disposition header needs a disposition type ("attachment")
// in front of the filename parameter.
header("Content-type:application/vnd.ms-excel");
header("Content-Disposition:attachment;filename=report.xls");

// Table header row: number, company name, job title, job requirements.
echo "<table cellspacing=1 cellpadding=2  border=1 bgColor=#000000><tr bgColor=#ffffff><th>编号</th><th>公司名称</th><th>岗位名称</th><th>岗位需求</th></tr>";

// id.php defines $allid, a comma-separated list of posting ids.
require "id.php";

// explode() replaces split(), which was deprecated in PHP 5.3 and removed
// in PHP 7.0; the delimiter is a plain string so no regex is needed.
$allidarray=explode(",",$allid);

// Emit one table row per id, numbered from 1. Blank entries (empty list,
// trailing comma, stray whitespace) are skipped so that they neither
// trigger a pointless request nor consume a row number.
$pid=0;
foreach($allidarray as $jobid){
    $jobid=trim($jobid);
    if($jobid===''){
        continue;
    }
    getContent(++$pid,$jobid);
}
//Content scraping
/**
 * Downloads one 51job posting page and echoes it as a single table row.
 *
 * The row contains four cells: the running number, the company name, the
 * job title and the job requirements. The three text cells are the first
 * capture of the corresponding regular expression; a cell is left empty
 * when the page could not be downloaded or the pattern did not match.
 * Captured fragments are echoed verbatim (they are HTML snippets taken
 * from the page), exactly as before.
 *
 * @param int    $pid Running row number printed in the first cell.
 * @param string $id  Posting id appended to the detail-page URL.
 * @return void
 */
function getContent($pid,$id){
    // The original statement here was `$c("...id=($id)")`, which called an
    // undefined variable function and never assigned $content, so nothing
    // was ever downloaded. Fetch the page explicitly instead.
    // NOTE(review): the literal parentheses around the id were dropped on
    // the assumption that they were part of the same paste damage - confirm
    // against the live URL format.
    $url="http://search.51job.com/jobsearch/show_job_detail.php?id=".urlencode(trim($id));
    $content=file_get_contents($url);
    if($content===false){
        // Keep the row (and its number) in the report even when the
        // download fails; the text cells simply stay empty.
        $content='';
    }

    // Patterns in column order: company name, job title, requirements.
    $patterns=array(
        '#title02">(.*)</TD#',
        '/padding-left:15px;">(.+?)<\/td><\/tr>/si',
        '/<\/strong><br>(.+?)<\/p>/si',
    );

    // Only the first match of each pattern is used, so preg_match() is
    // sufficient; guard against "no match" instead of reading an undefined
    // array offset.
    $cells=array($pid);
    foreach($patterns as $pattern){
        $cells[]=preg_match($pattern,$content,$m) ? $m[1] : '';
    }

    echo "<tr bgColor=#ffffff>";
    foreach($cells as $cell){
        echo("<td>".$cell."</td>");
    }
    echo "</tr>";
}
// Close the report table that was opened before the id loop.
print("</table>");
//Statistics by category
/**
 * Placeholder that is not implemented yet: the body does nothing.
 *
 * @param mixed $class Currently unused. Presumably the category to
 *                     summarise - TODO confirm once implemented.
 * @return null Always null, because no value is computed.
 */
function sumByClass($class)
{
    return null;
}
?>

id.php
<?php
// Comma-separated list of 51job posting ids; job.php requires this file,
// splits $allid on "," and scrapes one posting per entry.
// NOTE(review): id 108417810 appears twice in a row, so that posting is
// exported twice - confirm whether this is intentional.
$allid = '36051870,36051844,36196171,36195844,36195660,36462530,36530797,36475186,36474917,35802876,36326650,35517798,36495332,35779826,36416344,36482369,36509147,31167200,36361534,108413120,34974884,34986995,36523857,36523766,35062366,36493056,35994127,34749689,35513294,34673330,36482238,34639589,34630863,34630728,29979964,36507893,36081614,36485673,31507428,24701600,31507421,36428571,36365008,36072176,36072173,36035380,34548045,34548024,35405170,35814410,35814122,36431980,35814558,36370665,36370732,36459631,36459574,34796169,36048346,30275535,35080005,36167022,36017432,36360104,36360100,36323399,36295193,36295003,36287470,36360105,36360103,36360102,36360101,36360099,36323408,36323355,36287497,36472961,36496776,36498452,35738969,36440896,36306882,36495647,36526781,36526595,36526524,36333722,36493698,36049831,108413302,36256875,36183259,36506040,36090949,36350581,36343970,36360345,36303801,34975247,35822839,35822895,35587981,35990090,36240818,36513317,36513313,108412937,36449622,36409164,36508119,36206383,36194950,36194943,35939761,36399033,35523021,30986244,36314678,35945747,19939677,36183057,108413535,35099403,35099514,36495412,36424393,108411873,108412247,36520185,36058526,35924198,36467969,36416956,36156818,36434541,35533725,36465699,36503228,36205095,36108116,108412600,108411781,36501356,108420293,35839369,108413448,108419279,36422961,108413374,108418665,108418221,108418097,108418113,108417810,108417810,108417832,108417292,36456791,108413343,36358634,35688703,34995273,108413204,108416358,108415848,108415862,36160508,108415010,108414676,108414688,36362121,108413934,36054406,108413794,36191589,36506032,36173634,36173600,36490463,36489996,35352503,108359954,34882938,36479933,36453896,108410882,36501168,36319672,36323072,36320547,36181536,29979658,29979581,34988483,36428523,36473557,108393184,36354538,36354477,36406228,36406161,36393289,108390202,108387095,108387279,36100797,36192581,108387073,108387034';
?>
昵称: cab  时间: 2007-12-10 11:36:00
应鼓励原创精品!
昵称: luzhou  时间: 2007-12-10 20:24:00
怎么这么多ID 啊?
昵称: CFC4N  时间: 2007-12-11 10:20:00
昵称: diego  时间: 2007-12-26 09:19

学采集
昵称: diego  时间: 2007-12-26 09:19:00