SpringBoot+WebMagic+MyBaties实现爬虫和数据入库的示例分析,很多新手对此不是很清楚,为了帮助大家解决这个难题,下面小编将为大家详细讲解,有这方面需求的人可以来学习下,希望你能有所收获。
WebMagic是一个开源爬虫框架,项目通过在SpringBoot项目中使用WebMagic去抓取数据,最后使用MyBatis将数据入库。
本示例中库名为article,表名为cms_content,表中包含contentId、title、date三个字段。
CREATE TABLE `cms_content` ( `contentId` varchar(40) NOT NULL COMMENT '内容ID', `title` varchar(150) NOT NULL COMMENT '标题', `date` varchar(150) NOT NULL COMMENT '发布日期', PRIMARY KEY (`contentId`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='CMS内容表';
<?xml version="1.0" encoding="UTF-8"?> <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd"> <modelVersion>4.0.0</modelVersion> <parent> <groupId>org.springframework.boot</groupId> <artifactId>spring-boot-starter-parent</artifactId> <version>2.5.5</version> <relativePath/> </parent> <groupId>com.example</groupId> <artifactId>Article</artifactId> <version>0.0.1-SNAPSHOT</version> <name>Article</name> <description>Article</description> <properties> <java.version>1.8</java.version> <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> <maven.test.skip>true</maven.test.skip> <maven.compiler.plugin.version>3.8.1</maven.compiler.plugin.version> <maven.resources.plugin.version>3.1.0</maven.resources.plugin.version> <mysql.connector.version>5.1.47</mysql.connector.version> <druid.spring.boot.starter.version>1.1.17</druid.spring.boot.starter.version> <mybatis.spring.boot.starter.version>1.3.4</mybatis.spring.boot.starter.version> <fastjson.version>1.2.58</fastjson.version> <commons.lang3.version>3.9</commons.lang3.version> <joda.time.version>2.10.2</joda.time.version> <webmagic.core.version>0.7.5</webmagic.core.version> </properties> <dependencies> <dependency> <groupId>org.springframework.boot</groupId> <artifactId>spring-boot-starter-web</artifactId> </dependency> <dependency> <groupId>org.springframework.boot</groupId> <artifactId>spring-boot-starter-test</artifactId> <scope>test</scope> </dependency> <dependency> <groupId>org.springframework.boot</groupId> <artifactId>spring-boot-configuration-processor</artifactId> <optional>true</optional> </dependency> <dependency> <groupId>mysql</groupId> <artifactId>mysql-connector-java</artifactId> <version>${mysql.connector.version}</version> </dependency> <dependency> <groupId>com.alibaba</groupId> <artifactId>druid-spring-boot-starter</artifactId> <version>${druid.spring.boot.starter.version}</version> </dependency> <dependency> <groupId>org.mybatis.spring.boot</groupId> <artifactId>mybatis-spring-boot-starter</artifactId> <version>${mybatis.spring.boot.starter.version}</version> </dependency> <dependency> <groupId>com.alibaba</groupId> <artifactId>fastjson</artifactId> <version>${fastjson.version}</version> </dependency> <dependency> <groupId>org.apache.commons</groupId> <artifactId>commons-lang3</artifactId> <version>${commons.lang3.version}</version> </dependency> <dependency> <groupId>joda-time</groupId> <artifactId>joda-time</artifactId> <version>${joda.time.version}</version> </dependency> <dependency> <groupId>us.codecraft</groupId> <artifactId>webmagic-core</artifactId> <version>${webmagic.core.version}</version> <exclusions> <exclusion> <groupId>org.slf4j</groupId> <artifactId>slf4j-log4j12</artifactId> </exclusion> </exclusions> </dependency> </dependencies> <build> <plugins> <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-compiler-plugin</artifactId> <version>${maven.compiler.plugin.version}</version> <configuration> <source>${java.version}</source> <target>${java.version}</target> <encoding>${project.build.sourceEncoding}</encoding> </configuration> </plugin> <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-resources-plugin</artifactId> <version>${maven.resources.plugin.version}</version> <configuration> <encoding>${project.build.sourceEncoding}</encoding> </configuration> </plugin> <plugin> <groupId>org.springframework.boot</groupId> <artifactId>spring-boot-maven-plugin</artifactId> <configuration> <fork>true</fork> <addResources>true</addResources> </configuration> <executions> <execution> <goals> <goal>repackage</goal> </goals> </execution> </executions> </plugin> </plugins> </build> <repositories> <repository> <id>public</id> <name>aliyun nexus</name> <url>http://maven.aliyun.com/nexus/content/groups/public/</url> <releases> <enabled>true</enabled> </releases> </repository> </repositories> <pluginRepositories> <pluginRepository> <id>public</id> <name>aliyun nexus</name> <url>http://maven.aliyun.com/nexus/content/groups/public/</url> <releases> <enabled>true</enabled> </releases> <snapshots> <enabled>false</enabled> </snapshots> </pluginRepository> </pluginRepositories> </project>
数据实体,和表中3个字段对应。
package site.exciter.article.model; public class CmsContentPO { private String contentId; private String title; private String date; public String getContentId() { return contentId; } public void setContentId(String contentId) { this.contentId = contentId; } public String getTitle() { return title; } public void setTitle(String title) { this.title = title; } public String getDate() { return date; } public void setDate(String date) { this.date = date; } }
package site.exciter.article.dao; import org.apache.ibatis.annotations.Mapper; import site.exciter.article.model.CmsContentPO; @Mapper public interface CrawlerMapper { int addCmsContent(CmsContentPO record); }
在resources下新建mapper文件夹,在mapper下创建CrawlerMapper.xml
<?xml version="1.0" encoding="UTF-8"?> <!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd"> <mapper namespace="site.exciter.article.dao.CrawlerMapper"> <insert id="addCmsContent" parameterType="site.exciter.article.model.CmsContentPO"> insert into cms_content (contentId, title, date) values (#{contentId,jdbcType=VARCHAR}, #{title,jdbcType=VARCHAR}, #{date,jdbcType=VARCHAR}) </insert> </mapper>
配置数据库和mybatis映射关系。
# mysql spring.datasource.name=mysql spring.datasource.type=com.alibaba.druid.pool.DruidDataSource spring.datasource.driver-class-name=com.mysql.jdbc.Driver spring.datasource.url=jdbc:mysql://10.201.61.184:3306/article?useUnicode=true&characterEncoding=utf8&useSSL=false&allowMultiQueries=true spring.datasource.username=root spring.datasource.password=root # druid spring.datasource.druid.initial-size=5 spring.datasource.druid.min-idle=5 spring.datasource.druid.max-active=10 spring.datasource.druid.max-wait=60000 spring.datasource.druid.validation-query=SELECT 1 FROM DUAL spring.datasource.druid.test-on-borrow=false spring.datasource.druid.test-on-return=false spring.datasource.druid.test-while-idle=true spring.datasource.druid.time-between-eviction-runs-millis=60000 spring.datasource.druid.min-evictable-idle-time-millis=300000 spring.datasource.druid.max-evictable-idle-time-millis=600000 # mybatis mybatis.mapperLocations=classpath:mapper/CrawlerMapper.xml
解析html的逻辑。
package site.exciter.article; import org.springframework.stereotype.Component; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.selector.Selectable; @Component public class ArticlePageProcessor implements PageProcessor { private Site site = Site.me().setRetryTimes(3).setSleepTime(1000); @Override public void process(Page page) { String detail_urls_Xpath = "//*[@class='postTitle']/a[@class='postTitle2']/@href"; String next_page_xpath = "//*[@id='nav_next_page']/a/@href"; String next_page_css = "#homepage_top_pager > div:nth-child(1) > a:nth-child(7)"; String title_xpath = "//h2[@class='postTitle']/a/span/text()"; String date_xpath = "//span[@id='post-date']/text()"; page.putField("title", page.getHtml().xpath(title_xpath).toString()); if (page.getResultItems().get("title") == null) { page.setSkip(true); } page.putField("date", page.getHtml().xpath(date_xpath).toString()); if (page.getHtml().xpath(detail_urls_Xpath).match()) { Selectable detailUrls = page.getHtml().xpath(detail_urls_Xpath); page.addTargetRequests(detailUrls.all()); } if (page.getHtml().xpath(next_page_xpath).match()) { Selectable nextPageUrl = page.getHtml().xpath(next_page_xpath); page.addTargetRequests(nextPageUrl.all()); } else if (page.getHtml().css(next_page_css).match()) { Selectable nextPageUrl = page.getHtml().css(next_page_css).links(); page.addTargetRequests(nextPageUrl.all()); } } @Override public Site getSite() { return site; } }
处理数据的持久化。
package site.exciter.article; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Component; import site.exciter.article.model.CmsContentPO; import site.exciter.article.dao.CrawlerMapper; import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.pipeline.Pipeline; import java.util.UUID; @Component public class ArticlePipeline implements Pipeline { private static final Logger LOGGER = LoggerFactory.getLogger(ArticlePipeline.class); @Autowired private CrawlerMapper crawlerMapper; public void process(ResultItems resultItems, Task task) { String title = resultItems.get("title"); String date = resultItems.get("date"); CmsContentPO contentPO = new CmsContentPO(); contentPO.setContentId(UUID.randomUUID().toString()); contentPO.setTitle(title); contentPO.setDate(date); try { boolean success = crawlerMapper.addCmsContent(contentPO) > 0; LOGGER.info("保存成功:{}", title); } catch (Exception ex) { LOGGER.error("保存失败", ex); } } }
执行抓取任务。
package site.exciter.article; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Component; import us.codecraft.webmagic.Spider; import java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; @Component public class ArticleTask { private static final Logger LOGGER = LoggerFactory.getLogger(ArticlePipeline.class); @Autowired private ArticlePipeline articlePipeline; @Autowired private ArticlePageProcessor articlePageProcessor; private ScheduledExecutorService timer = Executors.newSingleThreadScheduledExecutor(); public void crawl() { // 定时任务,每10分钟爬取一次 timer.scheduleWithFixedDelay(() -> { Thread.currentThread().setName("ArticleCrawlerThread"); try { Spider.create(articlePageProcessor) .addUrl("http://www.cnblogs.com/dick159/default.html?page=2") // 抓取到的数据存数据库 .addPipeline(articlePipeline) // 开启5个线程抓取 .thread(5) // 异步启动爬虫 .start(); } catch (Exception ex) { LOGGER.error("定时抓取数据线程执行异常", ex); } }, 0, 10, TimeUnit.MINUTES); } }
package site.exciter.article; import org.mybatis.spring.annotation.MapperScan; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.boot.CommandLineRunner; import org.springframework.boot.SpringApplication; import org.springframework.boot.autoconfigure.SpringBootApplication; @SpringBootApplication @MapperScan(basePackages = "site.exciter.article.interface") public class ArticleApplication implements CommandLineRunner { @Autowired private ArticleTask articleTask; public static void main(String[] args) { SpringApplication.run(ArticleApplication.class, args); } @Override public void run(String... args) throws Exception { articleTask.crawl(); } }
看完上述内容是否对您有帮助呢?如果还想对相关知识有进一步的了解或阅读更多相关文章,请关注亿速云行业资讯频道,感谢您对亿速云的支持。
免责声明:本站发布的内容(图片、视频和文字)以原创、转载和分享为主,文章观点不代表本网站立场,如果涉及侵权请联系站长邮箱:is@yisu.com进行举报,并提供相关证据,一经查实,将立刻删除涉嫌侵权内容。