Created by shang on 16/9/9.
*/
<span class="hljs-keyword">public <span class="hljs-class"><span class="hljs-keyword">class <span class="hljs-title">JianShuProcessor <span class="hljs-keyword"><span class="hljs-keyword">implements <span class="hljs-type">PageProcessor {
<span class="hljs-keyword">private Site site = Site.me()
.setDomain(<span class="hljs-string">"jianshu.com")
.setSleepTime(<span class="hljs-number">100)
.setUserAgent(<span class="hljs-string">"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/52.0.2743.116 Safari/537.36");
;
<span class="hljs-keyword">public <span class="hljs-keyword">static final <span class="hljs-keyword">String list = <span class="hljs-string">"http://www.jianshu.com";
@Override
<span class="hljs-keyword">public void process(Page page) {
<span class="hljs-keyword">if (page.getUrl().regex(list).match()) {
List list=page.getHtml().xpath(<span class="hljs-string">"//ul[@class='article-list thumbnails']/li").nodes();
<span class="hljs-keyword">for (Selectable s : <span class="hljs-type">list) {
<span class="hljs-keyword">String title=s.xpath(<span class="hljs-string">"//div/h4/a/text()").toString();
<span class="hljs-keyword">String link=s.xpath(<span class="hljs-string">"//div/h4").links().toString();
News <span class="hljs-keyword">new<span class="hljs-type">s=<span class="hljs-keyword">new <span class="hljs-type">News();
<span class="hljs-keyword">new<span class="hljs-type">s.setTitle(title);
<span class="hljs-keyword">new<span class="hljs-type">s.setInfo(title);
<span class="hljs-keyword">new<span class="hljs-type">s.setLink(link);
<span class="hljs-keyword">new<span class="hljs-type">s.setSources(<span class="hljs-keyword">new <span class="hljs-type">Sources(<span class="hljs-number">5));
page.putField(<span class="hljs-string">"news"+title,<span class="hljs-keyword">new<span class="hljs-type">s);
}
}
}
@Override
<span class="hljs-keyword">public Site getSite() {
<span class="hljs-keyword">return site;
}
<span class="hljs-keyword">public <span class="hljs-keyword">static void main(<span class="hljs-keyword">String[] args) {
Spider spider=Spider.create(<span class="hljs-keyword">new <span class="hljs-type">JianShuProcessor());
spider.addUrl(<span class="hljs-string">"http://www.jianshu.com");
spider.addPipeline(<span class="hljs-keyword">new <span class="hljs-type">NewsPipeline());
spider.thread(<span class="hljs-number">5);
spider.setExitWhenComplete(<span class="hljs-literal">true);
spider.start();
}
}
2.3 入库模块Pipeline
入库模块结合Spring Boot的Repository模块一起组成入库方法:实现WebMagic的Pipeline接口及其process方法,在process方法中获取爬虫模块抓取的数据,然后调用Spring Boot中Repository的save方法保存入库。代码如下:
import com.shang.spray.entity.News;
import com.shang.spray.entity.Sources;
import com.shang.spray.repository.NewsRepository;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.jpa.domain.Specification;
import org.springframework.stereotype.Repository;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
import javax.persistence.criteria.CriteriaBuilder;
import javax.persistence.criteria.CriteriaQuery;
import javax.persistence.criteria.Predicate;
import javax.persistence.criteria.Root;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Map;
<span class="hljs-comment">/**
-
info:新闻
-
Created by shang on 16/8/22.
*/
@Repository
<span class="hljs-keyword">public <span class="hljs-class"><span class="hljs-keyword">class <span class="hljs-title">NewsPipeline <span class="hljs-keyword"><span class="hljs-keyword">implements <span class="hljs-type">Pipeline {
@Autowired
protected NewsRepository <span class="hljs-keyword">new<span class="hljs-type">sRepository;
@Override
<span class="hljs-keyword">public void process(ResultItems resultItems,Task task) {
<span class="hljs-keyword">for (Map.Entry<<span class="hljs-keyword">String,Object> entry : <span class="hljs-type">resultItems.getAll().entrySet()) {
<span class="hljs-keyword">if (entry.getKey().contains(<span class="hljs-string">"news")) {
News <span class="hljs-keyword">new<span class="hljs-type">s=(News) entry.getValue();
Specification specification=<span class="hljs-keyword">new <span class="hljs-type">Specification() {
@Override
<span class="hljs-keyword">public Predicate toPredicate(Root root,CriteriaQuery<?> criteriaQuery,CriteriaBuilder criteriaBuilder) {
<span class="hljs-keyword">return criteriaBuilder.and(criteriaBuilder.equal(root.<span class="hljs-keyword">get(<span class="hljs-string">"link"),<span class="hljs-keyword">new<span class="hljs-type">s.getLink()));
}
};
<span class="hljs-keyword">if (<span class="hljs-keyword">new<span class="hljs-type">sRepository.findOne(specification) == <span class="hljs-literal">null) {<span class="hljs-comment">//检查链接是否已存在
<span class="hljs-keyword">new<span class="hljs-type">s.setAuthor(<span class="hljs-string">"水花");
<span class="hljs-keyword">new<span class="hljs-type">s.setTypeId(<span class="hljs-number">1);
<span class="hljs-keyword">new<span class="hljs-type">s.setSort(<span class="hljs-number">1);
<span class="hljs-keyword">new<span class="hljs-type">s.setStatus(<span class="hljs-number">1);
<span class="hljs-keyword">new<span class="hljs-type">s.setExplicitLink(<span class="hljs-literal">true);
<span class="hljs-keyword">new<span class="hljs-type">s.setCreateDate(<span class="hljs-keyword">new <span class="hljs-type">Date());
<span class="hljs-keyword">new<span class="hljs-type">s.setModifyDate(<span class="hljs-keyword">new <span class="hljs-type">Date());
<span class="hljs-keyword">new<span class="hljs-type">sRepository.save(<span class="hljs-keyword">new<span class="hljs-type">s);
}
}
}
}
}
2.4 定时任务模块Scheduled
(编辑:莱芜站长网)
【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时联系站长删除相关内容!