为了简化MySQL数据库操作,这里引入了MyBatis和A.CTable框架。
MyBatis框架这里就不说了,很有名气的持久层框架。
A.CTable是一个基于Spring和MyBatis的Maven项目,增强了MyBatis的功能,通过配置model注解的方式来创建表、修改表结构,并提供通用的单表CRUD工具,目前仅支持MySQL。

引入依赖包

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
<!-- MySQL persistence stack: MyBatis starter, JDBC driver, Druid starter and A.CTable -->
<!-- MyBatis starter for Spring Boot; version is taken from the mybatis.boot.version property -->
<dependency>
<groupId>org.mybatis.spring.boot</groupId>
<artifactId>mybatis-spring-boot-starter</artifactId>
<version>${mybatis.boot.version}</version>
</dependency>
<!-- MySQL JDBC driver, runtime scope. No version is declared here; presumably it is
     managed by the parent POM / Spring Boot dependency management (TODO confirm) -->
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<scope>runtime</scope>
</dependency>
<!-- Alibaba Druid Spring Boot starter; version is taken from the druid.version property -->
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>druid-spring-boot-starter</artifactId>
<version>${druid.version}</version>
</dependency>
<!-- A.CTable MyBatis enhancement; version is taken from the mybatis.enhance.actable.version property -->
<dependency>
<groupId>com.gitee.sunchenbin.mybatis.actable</groupId>
<artifactId>mybatis-enhance-actable</artifactId>
<version>${mybatis.enhance.actable.version}</version>
</dependency>

依赖包需要的版本

1
2
3
<mybatis.boot.version>1.3.2</mybatis.boot.version>
<druid.version>1.1.14</druid.version>
<mybatis.enhance.actable.version>1.0.8.1.RELEASE</mybatis.enhance.actable.version>

SpringBoot配置

首先在MySQL数据库中添加库hy_spider,然后调整SpringBoot配置,增加如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
# NOTE(review): the indentation of this snippet was lost; the nesting below is reconstructed.
# The datasource keys are Spring Boot properties (spring.datasource.*). If application.yml
# already has a top-level `spring:` key, merge this block into it instead of repeating the key.
spring:
  datasource:
    type: com.alibaba.druid.pool.DruidDataSource
    driverClassName: com.mysql.cj.jdbc.Driver
    druid:
      url: jdbc:mysql://127.0.0.1:3306/hy_spider?useUnicode=true&characterEncoding=utf8&serverTimezone=GMT%2B8
      # Demo credentials only; do not use root/root outside a local test database.
      username: root
      password: root

# MyBatis
mybatis:
  # Package scanned for type aliases
  typeAliasesPackage: mobi.huanyuan.spider
  # Mapper scan: locate all *Mapper.xml mapping files (the project's own and A.CTable's)
  mapperLocations: classpath*:mapper/*Mapper.xml,classpath*:com/gitee/sunchenbin/mybatis/actable/mapping/*/*.xml
  # Global MyBatis configuration file
  configLocation: classpath:mybatis/mybatis-config.xml
  # The settings below configure A.CTable's automatic table creation / alteration
  table:
    # create: on startup all tables are dropped and re-created from the model definitions; this destroys existing data.
    # update: the framework works out which tables are new and which columns must be changed, dropped or added; existing data is not destroyed.
    # none:   the framework does nothing.
    auto: update
  model:
    pack: mobi.huanyuan.spider.bean
  database:
    type: mysql

MyBatis配置

resources/mybatis-config.xml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE configuration
PUBLIC "-//mybatis.org//DTD Config 3.0//EN"
"http://mybatis.org/dtd/mybatis-3-config.dtd">
<configuration>

<settings>
<setting name="cacheEnabled" value="true" /> <!-- enable caching globally for all mappers -->
<setting name="useGeneratedKeys" value="true" /> <!-- allow JDBC support for auto-generated keys -->
<setting name="defaultExecutorType" value="REUSE" /> <!-- default executor type -->
<setting name="logImpl" value="SLF4J" /> <!-- logging implementation MyBatis should use -->
<!-- <setting name="mapUnderscoreToCamelCase" value="true"/> camel-case naming (currently disabled) -->
</settings>

</configuration>

SpringBoot启动类配置

1
2
3
4
5
6
7
8
@MapperScan(value = {
"mobi.huanyuan.spider.mapper",
"com.gitee.sunchenbin.mybatis.actable.dao.*"
})
@ComponentScan(value = {
"mobi.huanyuan.spider",
"com.gitee.sunchenbin.mybatis.actable.manager.*"
})

存储实体类

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
package mobi.huanyuan.spider.bean;

import com.gitee.sunchenbin.mybatis.actable.annotation.Column;
import com.gitee.sunchenbin.mybatis.actable.annotation.Index;
import com.gitee.sunchenbin.mybatis.actable.annotation.Table;
import com.gitee.sunchenbin.mybatis.actable.command.BaseModel;
import com.gitee.sunchenbin.mybatis.actable.constants.MySqlTypeConstant;
import lombok.Data;
import lombok.EqualsAndHashCode;

import java.util.Date;

/**
 * Crawl record: one fetched page as captured by the spider.
 *
 * <p>Mapped to the table {@code SPIDER_RECORD} through the A.CTable {@code @Table} and
 * {@code @Column} annotations declared on this class.
 *
 * @author Jonathan L.(xingbing.lai@gmail.com)
 * @version 1.0.0 -- Datetime: 2020/2/20 10:48
 */
@EqualsAndHashCode(callSuper = true)
@Table(name = "SPIDER_RECORD")
@Data
public class SpiderRecord extends BaseModel {
/**
 * Primary key, declared as auto-increment.
 */
@Column(name = "ID", type = MySqlTypeConstant.INT, length = 11, isKey = true, isAutoIncrement = true)
private Integer id;
/**
 * Page URL.
 * NOTE(review): no explicit length is given, so the column length falls back to the
 * framework default -- confirm it is large enough for long URLs.
 */
@Column(name = "URL", type = MySqlTypeConstant.VARCHAR)
private String url;
/**
 * Page content (HTML).
 */
@Column(name = "HTML", type = MySqlTypeConstant.LONGTEXT)
private String html;
/**
 * Crawl depth.
 */
@Column(name = "DEPTH", type = MySqlTypeConstant.INT, length = 11)
private int depth;
/**
 * Creation time.
 */
@Column(name = "CREATE_TIME", type = MySqlTypeConstant.DATETIME)
private Date createTime;
/**
 * Keywords; multiple keywords are separated by ASCII commas. Indexed.
 */
@Index
@Column(name = "KEY_WORDS", type = MySqlTypeConstant.VARCHAR, length = 128)
private String keys;
/**
 * Crawl date in {@code yyyyMMdd} form, e.g. 20200220. Indexed.
 */
@Index
@Column(name = "DAY", type = MySqlTypeConstant.INT, length = 11)
private int day;
}

数据存储

数据存储部分上一节处理了文件存储,这里又是MySQL,以后指不定还会用到其他存储方式,所以将数据存储部分通过策略模式重构一下,方便之后的扩展。

数据存储接口

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
package mobi.huanyuan.spider.runable.store;

import mobi.huanyuan.spider.bean.SpiderRecord;

/**
 * Data storage strategy: the common contract implemented by every storage back end.
 *
 * @author Jonathan L.(xingbing.lai@gmail.com)
 * @version 1.0.0 -- Datetime: 2020/2/20 14:40
 */
public interface DataStoreStrategy {
/**
 * Stores a crawl record.
 *
 * @param record the crawl record to store
 */
void store(SpiderRecord record);
}

数据存储服务工厂类

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
package mobi.huanyuan.spider.runable.store;

import mobi.huanyuan.spider.BeanManager;
import mobi.huanyuan.spider.bean.SpiderRecord;
import mobi.huanyuan.spider.type.StoreType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Service;

import java.util.HashMap;
import java.util.Map;

/**
 * Factory that keeps track of the available data-store strategies and dispatches
 * store requests to them.
 *
 * <p>Strategy classes are registered per {@link StoreType}; at store time the registered
 * class is resolved to a bean through {@link BeanManager}.
 *
 * @author Jonathan L.(xingbing.lai@gmail.com)
 * @version 1.0.0 -- Datetime: 2020/2/20 14:45
 */
public class DataStoreFactory {
    private static Logger logger = LoggerFactory.getLogger(DataStoreFactory.class);

    /** Store type -> strategy class registered for that type. */
    private static final Map<StoreType, Class<?>> providers = new HashMap<>();

    /**
     * Registers (or replaces) the strategy class used for a store type.
     *
     * @param storeType store type the strategy handles
     * @param provider  strategy class; looked up as a bean when {@link #store} is called
     */
    public static void registerProvider(StoreType storeType, Class<?> provider) {
        providers.put(storeType, provider);
    }

    /**
     * Stores a record with the strategy registered for the given store type.
     *
     * @param storeType    store type selecting the strategy
     * @param spiderRecord record handed to the strategy
     * @return {@code true} if a strategy was found and invoked; {@code false} if no
     *         strategy is registered for the type or the resolved bean is not a
     *         {@link DataStoreStrategy}
     */
    public static boolean store(StoreType storeType, SpiderRecord spiderRecord) {
        final Class<?> strategyClass = providers.get(storeType);
        if (strategyClass == null) {
            logger.warn("Store strategy is null.[StoreType={}]", storeType);
            return false;
        }

        final Object candidate = BeanManager.getBean(strategyClass);
        if (!(candidate instanceof DataStoreStrategy)) {
            logger.error("Not Class with DataStoreStrategy: {}", strategyClass.getName());
            return false;
        }

        ((DataStoreStrategy) candidate).store(spiderRecord);
        return true;
    }
}

数据存储类型

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
package mobi.huanyuan.spider.type;

/**
 * Storage back ends the spider can write crawl results to.
 *
 * @author Jonathan L.(xingbing.lai@gmail.com)
 * @version 1.0.0 -- Datetime: 2020/2/18 20:53
 */
public enum StoreType {
    /** Database storage (MySQL). */
    MYSQL("MySQL"),

    /** File storage. */
    FILE("FILE");

    /** Textual label of the store type, as returned by {@link #getType()}. */
    private final String label;

    StoreType(String label) {
        this.label = label;
    }

    /**
     * Returns the textual label of this store type.
     *
     * @return the label passed to the constant's constructor
     */
    public String getType() {
        return this.label;
    }
}

数据存储的不同实现

文件存储方式

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
package mobi.huanyuan.spider.runable.store;

import mobi.huanyuan.spider.bean.SpiderRecord;
import mobi.huanyuan.spider.config.SpiderConfig;
import mobi.huanyuan.spider.type.StoreType;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.time.DateFormatUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

import java.io.*;
import java.nio.charset.StandardCharsets;

/**
 * File-based implementation of {@link DataStoreStrategy}.
 *
 * <p>The HTML of each record is written as UTF-8 to a file named after the record's
 * sanitized URL, inside a per-day directory ({@code yyyyMMdd}) appended to
 * {@code config.getStoreLocalPath()}. The class registers itself with
 * {@link DataStoreFactory} for {@link StoreType#FILE} in its static initializer.
 *
 * @author Jonathan L.(xingbing.lai@gmail.com)
 * @version 1.0.0 -- Datetime: 2020/2/20 14:42
 */
@Service
public class FileDataStoreStrategy implements DataStoreStrategy {
    private static final Logger logger = LoggerFactory.getLogger(FileDataStoreStrategy.class);

    /** Longest file name that is written; records yielding longer names are skipped. */
    private static final int MAX_FILE_NAME_LENGTH = 255;

    /** Regex character class matching every character stripped from file names. */
    private static final String ILLEGAL_FILE_NAME_CHARS = "[\\\\/:*?\"<>|]";

    static {
        DataStoreFactory.registerProvider(StoreType.FILE, FileDataStoreStrategy.class);
    }

    @Autowired
    private SpiderConfig config;

    /**
     * Writes the record's HTML to a local file named after its URL.
     *
     * <p>Nothing is written when the record is {@code null}, its HTML is blank, or the
     * file name derived from the URL is missing, empty or longer than
     * {@value #MAX_FILE_NAME_LENGTH} characters. The completion message is logged only
     * when the file was actually written.
     *
     * @param record the crawl record to store; may be {@code null}
     */
    @Override
    public void store(SpiderRecord record) {
        if (null == record || StringUtils.isBlank(record.getHtml())) {
            return;
        }
        String title = fileName(record.getUrl());
        // An empty name would make the target path the directory itself.
        if (StringUtils.isEmpty(title) || title.length() > MAX_FILE_NAME_LENGTH) {
            logger.warn("Skip file store, no usable file name for URL [{}]", record.getUrl());
            return;
        }
        if (storeHtmlToLocal(title, record.getHtml())) {
            logger.info("保存数据文件完成,当前线程[{}]", Thread.currentThread().getName());
        }
    }

    /**
     * Removes the characters that are not allowed in a file name:
     * {@code \ / : * ? " < > |}
     *
     * @param title raw title (here: the page URL); may be {@code null}
     * @return the title without the forbidden characters, or {@code null} if
     *         {@code title} is {@code null}
     */
    public String fileName(String title) {
        if (title == null) {
            return null;
        }
        return title.replaceAll(ILLEGAL_FILE_NAME_CHARS, "");
    }

    /**
     * Writes the HTML to a local file in the directory for the current day.
     *
     * @param title   file name
     * @param content content to write
     * @return {@code true} if the file was written and closed without error
     */
    private boolean storeHtmlToLocal(String title, String content) {
        String path = config.getStoreLocalPath() + DateFormatUtils.format(System.currentTimeMillis(), "yyyyMMdd");
        if (!makeDir(path)) {
            return false;
        }
        File target = new File(path + File.separator + title);
        // try-with-resources closes (and thereby flushes) the writer on every path.
        try (Writer writer = new OutputStreamWriter(new FileOutputStream(target), StandardCharsets.UTF_8)) {
            writer.write(content);
            return true;
        } catch (IOException e) {
            logger.error(e.getMessage(), e);
            return false;
        }
    }

    /**
     * Makes sure the storage directory exists, creating it if necessary.
     *
     * @param path storage directory
     * @return {@code true} if the directory exists when the method returns
     */
    private boolean makeDir(String path) {
        File dir = new File(path);
        if (dir.isDirectory()) {
            return true;
        }
        if (dir.mkdirs()) {
            logger.info("创建存储目录[{}]", path);
            return true;
        }
        // mkdirs() also returns false when another thread created the directory first.
        if (dir.isDirectory()) {
            return true;
        }
        logger.error("Failed to create storage directory [{}]", path);
        return false;
    }
}

MySQL数据库存储方式

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
package mobi.huanyuan.spider.runable.store;

import com.gitee.sunchenbin.mybatis.actable.manager.common.BaseMysqlCRUDManager;
import mobi.huanyuan.spider.bean.SpiderRecord;
import mobi.huanyuan.spider.type.StoreType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

/**
 * MySQL implementation of {@link DataStoreStrategy}.
 *
 * <p>Records are saved through the injected {@link BaseMysqlCRUDManager}. The class
 * registers itself with {@link DataStoreFactory} for {@link StoreType#MYSQL} in its
 * static initializer.
 *
 * @author Jonathan L.(xingbing.lai@gmail.com)
 * @version 1.0.0 -- Datetime: 2020/2/20 14:43
 */
@Service
public class MySqlDataStoreStrategy implements DataStoreStrategy {
    private static final Logger logger = LoggerFactory.getLogger(MySqlDataStoreStrategy.class);

    static {
        DataStoreFactory.registerProvider(StoreType.MYSQL, MySqlDataStoreStrategy.class);
    }

    @Autowired
    private BaseMysqlCRUDManager baseMysqlCRUDManager;

    /**
     * Saves the record via {@code baseMysqlCRUDManager.save(record)}.
     *
     * <p>A {@code null} record is skipped, matching the file-based strategy, which also
     * ignores {@code null} records.
     *
     * @param record the crawl record to store; may be {@code null}
     */
    @Override
    public void store(SpiderRecord record) {
        if (record == null) {
            logger.warn("Skip MySQL store: record is null");
            return;
        }
        baseMysqlCRUDManager.save(record);
    }
}

存储任务调整

SpiderStoreRunnable.java中将存储方法修改如下:

1
2
3
4
5
6
/**
 * Polls the next record from the store queue and, if there is one, passes it to
 * {@link DataStoreFactory} using the store type from the configuration.
 */
private synchronized void store() {
    final SpiderRecord next = SpiderQueue.storePoll();
    if (next == null) {
        // Nothing polled from the queue.
        return;
    }
    DataStoreFactory.store(config.getStoreType(), next);
}

其他

spring bean管理类

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
package mobi.huanyuan.spider;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.BeansException;
import org.springframework.context.ApplicationContext;
import org.springframework.context.ApplicationContextAware;
import org.springframework.stereotype.Component;

@Component
public class BeanManager implements ApplicationContextAware {

private static Logger logger = LoggerFactory.getLogger(BeanManager.class);
private static ApplicationContext context;

public void setApplicationContext(ApplicationContext applicationContext) throws BeansException {
logger.info("@@@@@@@@@@@@@@@@@@@@@@@ BeanManager init start ...");
if (BeanManager.context == null) {
BeanManager.context = applicationContext;
}
logger.info("@@@@@@@@@@@@@@@@@@@@@@@ BeanManager init finish ...");
}

/**
* 获取applicationContext
*
* @return {@link ApplicationContext}
*/
public static ApplicationContext getApplicationContext() {
return context;
}

/**
* 通过name获取 Bean.
*
* @param name bean name
* @return Spring bean
*/
public static Object getBean(String name) {
return getApplicationContext().getBean(name);
}

/**
* 通过class获取Bean.
*
* @param clazz class
* @param <T> Spring bean
* @return Spring bean
*/
public static <T> T getBean(Class<T> clazz) {
return getApplicationContext().getBean(clazz);
}

/**
* 通过name,以及Clazz返回指定的Bean
*
* @param name spring bean name
* @param clazz spring bean class
* @param <T> spring bean
* @return spring bean
*/
public static <T> T getBean(String name, Class<T> clazz) {
return getApplicationContext().getBean(name, clazz);
}

}

测试

配置调整

调整存储方式为MySQL,如下图:

代码调整

SpringBoot启动方法里边调整抓取地址:

1
2
3
4
5
// Obtain the Spider bean from the context (presumably the Spring application context
// returned at startup -- confirm against the surrounding main method).
Spider spider = context.getBean(Spider.class);
// Build the start page: the URL to crawl first, with its depth set to 0.
SpiderHtml startPage = new SpiderHtml();
startPage.setUrl("https://stackoverflow.com/questions/22000423/javafx-and-maven-nullpointerexception-location-is-required");
startPage.setDepth(0);
// Start the spider with this page.
spider.start(startPage);

运行结果

抓取完成之后,数据库中存储结果如下图: