验证中...
java文件
Raw Copy
/**
 * WebMagic {@link PageProcessor} for zhipin.com job listings.
 *
 * <p>Pages whose URL matches {@link #URL_LIST} are treated as list pages: they only
 * contribute new requests (job-detail links and further list pages). Every other page
 * is treated as a job-detail page and has its fields extracted into the page's result.
 */
public class SpiderBossTest implements PageProcessor {

    /** Regular expression matching the paginated list pages under {@code /c101190400/}. */
    public static final String URL_LIST = "https://www\\.zhipin\\.com/c101190400/\\?page=\\d+";

    /** Regular expression matching job-detail pages. */
    public static final String URL_POST = "https://www\\.zhipin\\.com/job_detail/\\w*\\~\\.html";

    /**
     * Dispatches the downloaded page to list-page or detail-page handling based on its URL.
     *
     * @param page the downloaded page handed over by the spider
     */
    @Override
    public void process(Page page) {
        if (page.getUrl().regex(URL_LIST).match()) {
            processListPage(page);
        } else {
            processDetailPage(page);
        }
    }

    /**
     * Handles a list page: queues the job-detail links found in it and any further list pages.
     *
     * @param page a page whose URL matched {@link #URL_LIST}
     */
    private void processListPage(Page page) {
        // A list page never sets any field. Mark it as skipped so the pipeline does not
        // persist an empty result for every list page that is crawled.
        page.setSkip(true);

        Selectable html = page.getHtml();
        Selectable jobLinks = html.xpath("//h3[@class=\"name\"]/").links();
        // Diagnostic output of the links discovered on this list page.
        Console.log(jobLinks);

        List<String> detailUrls = jobLinks.regex(URL_POST).all();
        if (CollectionUtil.isEmpty(detailUrls)) {
            // No job links on this page: do not queue anything else from it.
            return;
        }
        page.addTargetRequests(detailUrls);
        page.addTargetRequests(html.links().regex(URL_LIST).all());
    }

    /**
     * Handles a job-detail page: extracts each field by XPath and stores it in the page result.
     * Fields are written in a fixed order; single-valued fields use the first match and
     * multi-valued fields collect all matches.
     *
     * @param page a page whose URL did not match {@link #URL_LIST}
     */
    private void processDetailPage(Page page) {
        Selectable html = page.getHtml();

        // Header block.
        page.putField("job-status", first(html, "//div[@class='info-primary']/div[@class='job-status']/text()"));
        page.putField("job-name", first(html, "//div[@class='info-primary']/div[@class='name']/h1/text()"));
        page.putField("salary", first(html, "//div[@class='info-primary']/div[@class='name']/span[@class='salary']/text()"));
        page.putField("job-require", first(html, "//div[@class='info-primary']/p/text()"));
        page.putField("tags", every(html, "//div[@class='info-primary']/div[@class='tag-container']/div[@class='job-tags']/span/text()"));

        // "detail-op" block.
        page.putField("figure", first(html, "//div[@class='job-detail']/div[@class='detail-op']/div[@class='detail-figure']/img"));
        page.putField("opname", first(html, "//div[@class='job-detail']/div[@class='detail-op']/h2/text()"));
        page.putField("opposition", first(html, "//div[@class='job-detail']/div[@class='detail-op']/p[@class='gray']/text()"));

        // Job description and the sections that follow it.
        page.putField("job-desc", first(html, "//div[@class='detail-content']/div[@class='job-sec'][1]/div[@class='text']/text()"));
        page.putField("team-desc", first(html, "//div[@class='detail-content']/div[@class='job-sec'][2]/div[@class='text']/text()"));
        page.putField("jobtags", every(html, "//div[@class='detail-content']/div[@class='job-sec']/div[@class='job-tags']/span/text()"));
        page.putField("company-name", first(html, "//div[@class='detail-content']/div[@class='job-sec'][5]/div[@class='name']/text()"));
        page.putField("companydesc", first(html, "//div[@class='job-sec company-info']/div[@class='text']/text()"));
        page.putField("levels", every(html, "//div[@class='level-list']/li/text()"));
        page.putField("location", first(html, "//div[@class='location-address']/text()"));
    }

    /** Returns the string form of the selection produced by {@code xpath} on {@code html}. */
    private static String first(Selectable html, String xpath) {
        return html.xpath(xpath).toString();
    }

    /** Returns all results of evaluating {@code xpath} on {@code html}. */
    private static List<String> every(Selectable html, String xpath) {
        return html.xpath(xpath).all();
    }

    /**
     * Builds the crawl configuration: 3 retries, a sleep time of 5000, the first configured
     * user agent, and the {@code zhipin.com} domain.
     *
     * @return the site configuration used by the spider
     */
    @Override
    public Site getSite() {
        return Site.me()
                .setRetryTimes(3)
                .setSleepTime(5000)
                .setUserAgent(CommonConstant.userAgentArray[0])
                .setDomain("zhipin.com");
    }

    /**
     * Runs the spider starting from the first list page and sends results to a
     * {@code JsonFilePipeline} configured with the directory {@code D:\webmagic}.
     */
    @Test
    public void shuchu() {
        Spider.create(new SpiderBossTest())
                .addPipeline(new JsonFilePipeline("D:\\webmagic"))
                .addUrl("https://www.zhipin.com/c101190400/?page=1")
                .run();
    }
}

Comment list( 0 )

You need to sign in to post a comment

Help Search