需求

抓取URL上带有news或者blog的网页信息，将其整个页面信息保存到文件中。

配置

在SpiderConfig类中添加配置：

/**
 * Number of page-analysis (miner) threads.
 */
public int minerThreadNum = 2;
/**
 * Keywords that a URL must contain.
 * No initializer is given here, so the field stays {@code null} until something assigns it.
 */
public List<String> keys;

修改application.properties(.yml)，增加新配置属性，如下图：

队列管理

SpiderQueue中增加存储队列，主要方法如下：

/**
 * Storage queue.<br>
 * Storage threads take their data from here.
 * <p>
 * NOTE(review): {@code volatile} only affects visibility of the reference itself; the
 * {@code LinkedList} contents are protected only by the {@code synchronized} accessors
 * ({@code addStore}, {@code storePoll}). If the field is never reassigned, {@code final}
 * would express the intent better - confirm against the rest of the class.
 */
private static volatile Queue<SpiderHtml> store = new LinkedList<SpiderHtml>();

/**
 * Appends a crawled page to the tail of the storage queue.
 * Holds the class-level lock while the queue is modified.
 *
 * @param html crawled page to enqueue
 */
public static synchronized void addStore(SpiderHtml html) {
    store.offer(html);
}

/**
 * Removes and returns the page at the head of the storage queue.
 * Holds the class-level lock while the queue is modified.
 *
 * @return the dequeued crawled page, or {@code null} when the queue is empty
 */
public static synchronized SpiderHtml storePoll() {
    final SpiderHtml head = store.poll();
    return head;
}

/**
 * Reports whether the storage queue currently holds no pages.
 * <p>
 * Takes the same class-level lock as {@code addStore} and {@code storePoll}: the backing
 * {@code LinkedList} is not thread-safe, so reading it while another thread mutates it
 * under the lock would be a data race.
 *
 * @return {@code true} if the storage queue is empty, {@code false} otherwise
 */
public static synchronized boolean storeIsEmpty() {
    return store.isEmpty();
}

抓取分析任务

主要作用是将等待队列中的URL拉取出来，依次抓取网页信息，分析URL关键字。

package mobi.huanyuan.spider.runable;

import mobi.huanyuan.spider.SpiderApplication;
import mobi.huanyuan.spider.SpiderQueue;
import mobi.huanyuan.spider.bean.SpiderHtml;
import mobi.huanyuan.spider.config.SpiderConfig;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.HashSet;
import java.util.List;
import java.util.Set;

/**
 * Page-parsing task.
 * <p>
 * Repeatedly takes a page from the "waiting to be mined" queue, collects the links on that
 * page, keeps the ones whose URL contains a configured keyword, and hands them to the
 * unvisited queue one depth level deeper.
 *
 * @author Jonathan L.(xingbing.lai@gmail.com)
 * @version 1.0.0 -- Datetime: 2020/2/18 17:11
 */
public class SpiderParseHtmlRunnable implements Runnable {
    private static final Logger logger = LoggerFactory.getLogger(SpiderParseHtmlRunnable.class);

    /** Pause, in milliseconds, before polling again after the waiting queue handed out nothing. */
    private static final long IDLE_SLEEP_MILLIS = 100L;

    private final SpiderConfig config;

    /**
     * @param config crawler configuration (max depth and URL keywords are read from it)
     */
    public SpiderParseHtmlRunnable(SpiderConfig config) {
        this.config = config;
    }

    /**
     * Processes queued pages until the application-wide stop flag is set or this thread is
     * interrupted. When no page is available the thread sleeps briefly instead of spinning.
     */
    @Override
    public void run() {
        while (!SpiderApplication.isStopping) {
            if (parse()) {
                continue;
            }
            try {
                Thread.sleep(IDLE_SLEEP_MILLIS);
            } catch (InterruptedException e) {
                // Keep the interrupt visible to the executor and stop working.
                Thread.currentThread().interrupt();
                return;
            }
        }
    }

    /**
     * Handles at most one page from the waiting queue.
     * <p>
     * Not synchronized: the method touches no mutable state of this instance, and an
     * instance-level lock would not coordinate separate task instances anyway.
     *
     * @return {@code true} if a page was dequeued (whether or not it produced links),
     *         {@code false} if the queue returned nothing
     */
    private boolean parse() {
        SpiderHtml html = SpiderQueue.waitingMinePoll(); // next page waiting for link extraction
        if (html == null) {
            return false;
        }
        if (StringUtils.isBlank(html.getHtml())) {
            return true;
        }
        // Only pages above the configured maximum depth contribute further links.
        if (html.getDepth() >= config.getMaxDepth()) {
            return true;
        }

        logger.info("获取页面[{}]下所有URL。。。。。。 当前线程 [{}]", html.getUrl(), Thread.currentThread().getName());
        // NOTE(review): this fetches the page again although html.getHtml() is already
        // available; parsing the stored HTML would save a request - confirm intent.
        for (String url : getAllUrl(html.getUrl())) {
            if (url == null || url.isEmpty()) {
                continue;
            }
            if (url.endsWith("/")) {
                url = url.substring(0, url.length() - 1);
            }
            // Skip links whose URL contains none of the configured keywords.
            if (!checkKeys(url, config.getKeys())) {
                continue;
            }

            SpiderHtml minerUrl = new SpiderHtml();
            minerUrl.setUrl(url);
            minerUrl.setDepth(html.getDepth() + 1); // one level deeper than the current page
            // Queue for visiting, then record the URL in the URL set.
            // NOTE(review): de-duplication is presumably done inside SpiderQueue - verify.
            SpiderQueue.addUnVisited(minerUrl);
            SpiderQueue.addUrlSet(minerUrl.getUrl());
        }
        return true;
    }

    /**
     * Fetches a page and collects the targets of all of its {@code <a href>} links.
     * <p>
     * Links are read through jsoup's {@code abs:} attribute prefix so that relative links
     * are resolved against the page address; links for which jsoup returns an empty string
     * are left out. Any failure while fetching or parsing is logged and yields whatever
     * was collected so far (normally an empty set).
     *
     * @param url address of the page to fetch
     * @return the link targets found on the page; never {@code null}
     */
    public static Set<String> getAllUrl(String url) {
        Set<String> urls = new HashSet<>();
        try {
            Connection conn = Jsoup.connect(url);
            conn.header("User-Agent", "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13"); // pose as a browser
            Document document = conn.timeout(5000).get();
            Elements hrefs = document.select("a[href]");
            for (Element href : hrefs) {
                String absolute = href.attr("abs:href");
                if (!absolute.isEmpty()) {
                    urls.add(absolute);
                }
            }
        } catch (Exception e) {
            // Best effort: one unreachable page must not stop the worker. Passing the
            // throwable as the last argument keeps the stack trace in the log.
            logger.warn("获取URL出现异常，异常URL[{}]", url, e);
        }
        return urls;
    }

    /**
     * Tells whether a URL contains at least one of the given keywords.
     *
     * @param key  the URL (or any text) to inspect
     * @param keys keywords to look for; {@code null} entries are ignored
     * @return {@code true} if {@code key} contains any keyword; {@code false} otherwise,
     *         including when {@code key} or {@code keys} is {@code null}
     */
    public static boolean checkKeys(String key, List<String> keys) {
        if (key == null || keys == null) {
            return false;
        }
        for (String k : keys) {
            if (k != null && key.contains(k)) {
                return true;
            }
        }
        return false;
    }
}

爬虫主类修改

修改Spider的start方法，增加分析线程逻辑。

// mine
for(int i = 0; i < spiderConfig.getMinerThreadNum(); i++){
    SpiderParseHtmlRunnable parseHtmlRunnable = new SpiderParseHtmlRunnable(spiderConfig);
    threadPoolTaskExecutor.execute(parseHtmlRunnable);
}

监控线程

这里增加一个监控线程，作用就是在爬虫队列里边的数据处理完成之后，监控线程关闭线程池、停止运行程序。
在springboot的main方法中添加如下代码：

//*******************************************************
//  Monitor thread: every 5 seconds checks whether any crawler queue still
//  holds work; once all are empty it stops the pool and exits the program.
//*******************************************************
// NOTE(review): the check looks only at the queues, so an item that a worker has already
// dequeued but not finished processing is not counted - confirm this cannot end the run early.
threadPoolTaskExecutor = (ThreadPoolTaskExecutor) context.getBean("threadPoolTaskExecutor");
threadPoolTaskExecutor.execute(() -> {
    while (!isStopping) {
        try {
            TimeUnit.SECONDS.sleep(5);
        } catch (InterruptedException e) {
            // Restore the flag and stop monitoring; looping on would make every
            // following sleep fail immediately and turn this into a busy loop.
            Thread.currentThread().interrupt();
            logger.warn("监控线程被中断，停止监控。当前线程[{}]", Thread.currentThread().getName());
            return;
        }
        if (SpiderQueue.unVisitedIsEmpty()
                && SpiderQueue.waitingMineIsEmpty()
                && SpiderQueue.storeIsEmpty()) {
            isStopping = true;
            threadPoolTaskExecutor.shutdown();
            logger.info("程序结束。。。。。。当前线程[{}]", Thread.currentThread().getName());
            long endTime = System.currentTimeMillis();
            logger.info("已经访问队列URL大小[{}]当前线程[{}]", SpiderQueue.getUrlSetSize(), Thread.currentThread().getName());
            logger.info("用时[{}ms]当前线程[{}]", endTime - starTime, Thread.currentThread().getName());

            // Stop Spring Boot, then terminate the JVM.
            context.close();
            System.exit(0);
        }
    }
});