Crawl job listings from 51job with Spring Boot + MyBatis-Plus + WebMagic and save them to a MySQL database.
Create the project
Add the Maven dependencies
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.2.5.RELEASE</version>
        <relativePath/> <!-- lookup parent from repository -->
    </parent>
    <groupId>com.hg</groupId>
    <artifactId>spider-demo</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <name>spider-demo</name>
    <description>Crawler in practice</description>

    <properties>
        <java.version>1.8</java.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <optional>true</optional>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>cn.hutool</groupId>
            <artifactId>hutool-all</artifactId>
            <version>5.1.0</version>
        </dependency>
        <!-- Druid database connection pool -->
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>druid-spring-boot-starter</artifactId>
            <version>1.1.10</version>
        </dependency>
        <!-- MySQL connector -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <scope>runtime</scope>
        </dependency>
        <!-- MyBatis-Plus -->
        <dependency>
            <groupId>com.baomidou</groupId>
            <artifactId>mybatis-plus-boot-starter</artifactId>
            <version>3.0.5</version>
        </dependency>
        <!-- WebMagic -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>0.7.3</version>
        </dependency>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>0.7.3</version>
        </dependency>
        <!-- Guava, required by WebMagic's BloomFilterDuplicateRemover -->
        <dependency>
            <groupId>com.google.guava</groupId>
            <artifactId>guava</artifactId>
            <version>16.0</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
            </plugin>
        </plugins>
    </build>
</project>
Table creation statement
Create a database named spider, then create the table job_info:
CREATE TABLE `job_info` (
  `id` bigint(20) NOT NULL AUTO_INCREMENT COMMENT 'Primary key',
  `company_name` varchar(100) DEFAULT NULL COMMENT 'Company name',
  `company_addr` varchar(200) DEFAULT NULL COMMENT 'Company address',
  `job_name` varchar(100) DEFAULT NULL COMMENT 'Job title',
  `job_addr` varchar(50) DEFAULT NULL COMMENT 'Work location',
  `salary` varchar(50) DEFAULT NULL COMMENT 'Salary range',
  `url` varchar(150) DEFAULT NULL COMMENT 'URL of the job detail page',
  `time` varchar(10) DEFAULT NULL COMMENT 'Most recent posting date',
  `job_detail` text COMMENT 'Job description',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='Job listings';
Add the configuration file
Create application.yml:
spring:
  application:
    name: spider-service
  jackson:
    time-zone: GMT+8
    date-format: yyyy-MM-dd HH:mm:ss
  datasource:
    driver-class-name: com.mysql.cj.jdbc.Driver
    url: jdbc:mysql://localhost:3306/spider?useUnicode=true&characterEncoding=utf8&autoReconnect=true&useSSL=false
    username: root
    password: root
    type: com.alibaba.druid.pool.DruidDataSource
    druid:
      initialSize: 10
      minIdle: 10
      maxActive: 50
      maxWait: 60000
      timeBetweenEvictionRunsMillis: 60000
      minEvictableIdleTimeMillis: 300000
      validationQuery: SELECT 1 FROM DUAL
      testWhileIdle: true
      testOnBorrow: false
      testOnReturn: false
      poolPreparedStatements: true
      maxPoolPreparedStatementPerConnectionSize: 20
      filters: stat,wall
      connectionProperties: druid.stat.mergeSql=true;druid.stat.slowSqlMillis=5000

# MyBatis-Plus
mybatis-plus:
  mapper-locations: classpath:mapper/**/*.xml
  typeAliasesPackage: com.hg.*.entity
  global-config:
    db-config:
      id-type: auto
      field-strategy: not_empty
      table-underline: true
      db-type: mysql
    refresh: true
  configuration:
    map-underscore-to-camel-case: true
    cache-enabled: false

logging:
  level:
    org.springframework.web: info
    org.apache.http: info
    us.codecraft.webmagic: info
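One thing the walkthrough never shows is the Spring Boot entry class. A minimal sketch follows; the class name is hypothetical and the @MapperScan package is an assumption based on the package layout used in the rest of the article. Without @MapperScan (or @Mapper on each mapper interface), the JobInfoDao defined later would not be registered as a Spring bean.

package com.hg.spider;

import org.mybatis.spring.annotation.MapperScan;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;

// Entry point; @MapperScan registers the MyBatis-Plus mapper interfaces
// (package name assumed from the article's layout).
@SpringBootApplication
@MapperScan("com.hg.spider.dao")
public class SpiderDemoApplication {

    public static void main(String[] args) {
        SpringApplication.run(SpiderDemoApplication.class, args);
    }
}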
Write the POJO
package com.hg.spider.entity;

import com.baomidou.mybatisplus.annotation.TableId;
import com.baomidou.mybatisplus.annotation.TableName;
import lombok.Data;

@Data
@TableName("job_info")
public class JobInfo {

    @TableId
    private Long id;

    /**
     * Company name
     */
    private String companyName;

    /**
     * Company address
     */
    private String companyAddr;

    /**
     * Job title
     */
    private String jobName;

    /**
     * Work location
     */
    private String jobAddr;

    /**
     * Job description
     */
    private String jobDetail;

    /**
     * Salary range
     */
    private String salary;

    /**
     * Crawled URL
     */
    private String url;

    /**
     * Posting date
     */
    private String time;
}
Write the DAO
package com.hg.spider.dao;

import com.baomidou.mybatisplus.core.mapper.BaseMapper;
import com.hg.spider.entity.JobInfo;

/**
 * @Author skh
 * @Date 2020/3/21 16:27
 * @Desc Inherits generic CRUD methods from MyBatis-Plus's BaseMapper; no XML mapper needed.
 */
public interface JobInfoDao extends BaseMapper<JobInfo> {
}
Write the Service
package com.hg.spider.service;

import com.baomidou.mybatisplus.core.conditions.query.QueryWrapper;
import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl;
import com.hg.spider.dao.JobInfoDao;
import com.hg.spider.entity.JobInfo;
import com.hg.spider.webmagic.JobProcessor;
import com.hg.spider.webmagic.MysqlPipeline;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.QueueScheduler;

import java.util.List;

/**
 * @Author skh
 * @Date 2020/3/21 12:10
 * @Desc
 */
@Service
@Slf4j
public class JobInfoService extends ServiceImpl<JobInfoDao, JobInfo> {

    // Seed URL for the crawl
    String url = "https://search.51job.com/list/080200,000000,0000,26,9,99,%25E6%2588%25BF%25E4%25BA%25A7%25E7%25BB%258F%25E7%25BA%25AA%25E4%25BA%25BA,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=";

    @Autowired
    private MysqlPipeline mysqlPipeline;

    @Autowired
    private JobProcessor jobProcessor;

    public void getJobInfo() {
        log.info("Starting the crawl");
        // Configure and start the spider
        Spider.create(jobProcessor)
                .addUrl(url) // seed URL
                // Deduplicate URLs with a Bloom filter (requires the guava dependency)
                .setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(100000)))
                .thread(50) // number of worker threads
                .addPipeline(mysqlPipeline) // persistence pipeline
                .run();
    }

    public List<JobInfo> selectJobInfoByUrl(String url) {
        QueryWrapper<JobInfo> wrapper = new QueryWrapper<>();
        wrapper.eq("url", url);
        return this.baseMapper.selectList(wrapper);
    }
}
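Note that Spider.run() blocks the calling thread until the crawl finishes, so the HTTP request that triggers getJobInfo() will hang until every page has been processed. If that is undesirable, WebMagic also provides runAsync(), which starts the same crawl on a background thread; a sketch of the one-line change:

Spider.create(jobProcessor)
        .addUrl(url)
        .setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(100000)))
        .thread(50)
        .addPipeline(mysqlPipeline)
        .runAsync(); // returns immediately; the crawl continues in the background

The 100000 passed to BloomFilterDuplicateRemover is the expected number of distinct URLs. A Bloom filter uses far less memory than a HashSet at that scale, at the cost of a small false-positive rate: occasionally a URL is wrongly treated as already seen and skipped.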
Write the Controller
package com.hg.spider.controller;

import com.hg.spider.service.JobInfoService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RestController;

/**
 * @Author skh
 * @Date 2020/3/21 12:24
 * @Desc
 */
@RestController
public class JobInfoController {

    @Autowired
    private JobInfoService jobInfoService;

    @GetMapping("/getJobInfo")
    public String getJobInfo() {
        jobInfoService.getJobInfo();
        return "success";
    }
}
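With the application running, a GET request to /getJobInfo (e.g. http://localhost:8080/getJobInfo, assuming the default port) kicks off the crawl; because of the blocking run() call above, "success" is only returned once the crawl completes.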
Implement PageProcessor to define the page-parsing logic
package com.hg.spider.webmagic;

import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.util.StrUtil;
import com.hg.spider.entity.JobInfo;
import com.hg.spider.service.JobInfoService;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;

import java.util.List;

/**
 * @Author skh
 * @Date 2020/3/20 22:56
 * @Desc Parses list pages and detail pages
 */
@Component
@Slf4j
public class JobProcessor implements PageProcessor {

    @Autowired
    private JobInfoService jobInfoService;

    /**
     * Parse a page.
     * @param page
     */
    @Override
    public void process(Page page) {
        // Try to parse the page as a list page
        List<Selectable> nodes = page.getHtml().css("div#resultList div.el").nodes();
        if (CollUtil.isEmpty(nodes)) {
            // No list nodes: this is a job detail page, so parse it and save the data
            try {
                this.saveJobInfo(page);
            } catch (Exception e) {
                log.error("Failed to parse detail page: {}", e.getMessage(), e);
            }
        } else {
            // List page: extract the detail-page URLs and add them to the task queue
            for (Selectable node : nodes) {
                // Extract the detail-page URL
                String jobInfoUrl = node.css("p.t1 span a").links().toString();
                if (StrUtil.isNotBlank(jobInfoUrl)) {
                    // Skip records that already exist in the database
                    List<JobInfo> jobInfoList = jobInfoService.selectJobInfoByUrl(jobInfoUrl);
                    if (CollUtil.isEmpty(jobInfoList)) {
                        // Add the URL to the task queue
                        page.addTargetRequest(jobInfoUrl);
                    } else {
                        log.info("Record already exists, url: {}", jobInfoUrl);
                    }
                }
            }
            // Extract the next-page URL
            List<String> all = page.getHtml().css("div.p_in li.bk a").links().all();
            String bkUrl = all.get(all.size() - 1);
            log.info("Next page url: {}", bkUrl);
            if (StrUtil.containsAny(bkUrl, "11.html")) {
                log.info("10 pages crawled; stopping instead of crawling indefinitely");
                return;
            }
            page.addTargetRequest(bkUrl);
        }
    }

    /**
     * Parse a job detail page.
     * @param page
     */
    private void saveJobInfo(Page page) {
        // Parse the page
        Html html = page.getHtml();
        String companyName = html.css("div.cn p.cname a", "text").get();
        List<String> text = html.css("div.bmsg.inbox p.fp", "text").all();
        String companyAddr = text.get(text.size() - 1);
        String jobName = html.css("div.cn h1", "text").get();
        String jobStr = html.css("p.msg.ltype", "text").get();
        String[] s = StrUtil.split(jobStr, " ");
        String jobAddr = s[0];
        String time = "";
        for (String s1 : s) {
            // The posting date is the segment containing "发布" ("published")
            if (StrUtil.containsAny(s1, "发布")) {
                time = StrUtil.removeAll(s1, "发布");
                break;
            }
        }
        String jobDetail = html.css("div.bmsg.job_msg.inbox", "allText").get();
        String url = page.getUrl().get();
        String salary = html.css("div.in div.cn strong", "text").get();

        JobInfo jobInfo = new JobInfo();
        jobInfo.setJobName(jobName);
        jobInfo.setJobAddr(jobAddr);
        jobInfo.setJobDetail(jobDetail);
        jobInfo.setSalary(salary);
        jobInfo.setUrl(url);
        jobInfo.setTime(time);
        jobInfo.setCompanyName(companyName);
        jobInfo.setCompanyAddr(companyAddr);
        // Put the result into ResultItems so the pipeline can persist it
        page.putField("jobInfo", jobInfo);
    }

    // Crawler configuration
    private Site site = Site.me()
            .setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36")
            .setCharset("gbk")
            .setTimeOut(10 * 1000)
            .setRetryTimes(3)
            .setRetrySleepTime(3000);

    @Override
    public Site getSite() {
        return site;
    }
}
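These CSS selectors are tied to 51job's markup at the time of writing and will start returning null once the site's layout changes. A cheap way to sanity-check them without launching a full crawl is to feed saved page source into WebMagic's Html wrapper directly. A minimal sketch; the raw string here is a hypothetical stand-in for the source of a downloaded detail page:

import us.codecraft.webmagic.selector.Html;

public class SelectorCheck {
    public static void main(String[] args) {
        // Hypothetical sample; in practice, paste the saved source of a real 51job detail page
        String raw = "<div class=\"cn\"><h1>Java开发工程师</h1><p class=\"cname\"><a>某某网络科技有限公司</a></p></div>";
        Html html = new Html(raw);
        System.out.println(html.css("div.cn h1", "text").get());        // job title
        System.out.println(html.css("div.cn p.cname a", "text").get()); // company name
    }
}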
Implement Pipeline to save the results to the database
package com.hg.spider.webmagic;

import com.hg.spider.entity.JobInfo;
import com.hg.spider.service.JobInfoService;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

/**
 * @Author skh
 * @Date 2020/3/21 16:18
 * @Desc Persists crawled job data to MySQL
 */
@Component
@Slf4j
public class MysqlPipeline implements Pipeline {

    @Autowired
    private JobInfoService jobInfoService;

    @Override
    public void process(ResultItems resultItems, Task task) {
        // Fetch the JobInfo object prepared by the PageProcessor
        JobInfo jobInfo = resultItems.get("jobInfo");
        if (jobInfo != null) {
            jobInfoService.save(jobInfo);
        }
    }
}
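During development it can help to see what each page produced before anything reaches MySQL. WebMagic ships with a ConsolePipeline (us.codecraft.webmagic.pipeline.ConsolePipeline) that prints every ResultItems entry to stdout, and pipelines can be chained, so it can run alongside MysqlPipeline. A sketch of the change in JobInfoService.getJobInfo():

Spider.create(jobProcessor)
        .addUrl(url)
        // scheduler and thread settings unchanged from getJobInfo()
        .addPipeline(new ConsolePipeline()) // print each ResultItems to stdout for inspection
        .addPipeline(mysqlPipeline)         // then persist to MySQL as before
        .run();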
Original article: https://juejin.im/post/5e785460f265da57424bb2de
