Java Crawler Implementation

Web Crawling

In the era of big data, we need to collect large amounts of data for mining, analysis, and filtering. For example, when we build a project that needs plenty of real data to work with, we have to scrape that data from certain publicly accessible websites.

  • Note: crawler technology itself does not violate any law, but the law does have explicit provisions about which data may be scraped and how the scraped data may be used. Please only use crawlers within the bounds of the law.

Project Overview

A scheduled task that scrapes mobile-phone product data from the JD.com mall.

Project Setup

1. Required dependencies

<!-- Spring Boot parent project -->
<parent>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-parent</artifactId>
    <version>2.2.6.RELEASE</version>
    <relativePath/> <!-- lookup parent from repository -->
</parent>
<!-- Spring MVC -->
<dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-web</artifactId>
</dependency>
<!-- Spring Data JPA -->
<dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-data-jpa</artifactId>
</dependency>
<!-- MySQL JDBC driver -->
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
</dependency>
<!-- HttpClient, for fetching page content -->
<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
</dependency>
<!-- Jsoup, for parsing the downloaded pages -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.11.3</version>
</dependency>
<!-- commons-lang3, utility classes that help with string handling -->
<dependency>
    <groupId>org.apache.commons</groupId>
    <artifactId>commons-lang3</artifactId>
</dependency>
<!-- Configuration processor, so the IDE offers hints for bound configuration properties -->
<dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-configuration-processor</artifactId>
    <optional>true</optional>
</dependency>
<!-- Lombok, to cut down on boilerplate -->
<dependency>
    <groupId>org.projectlombok</groupId>
    <artifactId>lombok</artifactId>
    <version>1.16.14</version>
</dependency>
<!-- JUnit, for unit tests -->
<dependency>
    <groupId>junit</groupId>
    <artifactId>junit</artifactId>
    <scope>test</scope>
</dependency>

2. Project configuration file (application.properties)

# Database
spring.datasource.driver-class-name=com.mysql.cj.jdbc.Driver
spring.datasource.url=jdbc:mysql://localhost:3306/test?useUnicode=true&characterEncoding=utf8&useSSL=false&serverTimezone=Asia/Shanghai
spring.datasource.username=root
spring.datasource.password=123456

# JPA
spring.jpa.database=MySQL
spring.jpa.show-sql=true

# Logging (logging.file.name sets the log file; logging.file.path would be a directory)
logging.file.name=springboot.log
logging.pattern.file=%d{yyyy-MM-dd HH:mm:ss} : [%thread] : %-5level : %logger{50} : %msg%n
logging.pattern.console=%d{yyyy-MM-dd HH:mm:ss} : [%thread] : %-5level : %logger{50} : %msg%n

3. Database setup

Table creation statement:


CREATE TABLE `jd_item` (
  `id` bigint NOT NULL AUTO_INCREMENT COMMENT 'primary key',
  `spu` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT 'product family (SPU) id',
  `sku` bigint DEFAULT NULL COMMENT 'smallest sellable unit (SKU) id',
  `title` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL COMMENT 'product title',
  `price` double DEFAULT NULL COMMENT 'product price',
  `pic` varchar(200) DEFAULT NULL COMMENT 'product image',
  `url` varchar(200) DEFAULT NULL COMMENT 'product detail page URL',
  `created` datetime DEFAULT NULL COMMENT 'creation time',
  `updated` datetime DEFAULT NULL COMMENT 'update time',
  PRIMARY KEY (`id`,`spu`),
  KEY `sku` (`sku`) USING BTREE
) ENGINE=InnoDB AUTO_INCREMENT=20 DEFAULT CHARSET=utf8 COMMENT='JD product table';


Writing the Code

The application bootstrap class

package org.nynu.Crawler_JD;

import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.scheduling.annotation.EnableScheduling;

//@EnableScheduling turns on Spring's scheduled-task support for the crawler task below
@SpringBootApplication
@EnableScheduling
public class Application {
    public static void main(String[] args) {
        SpringApplication.run(Application.class, args);
    }
}
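
@EnableScheduling is what lets the @Scheduled method in the task class below fire automatically. Besides fixedDelay, which this project uses, @Scheduled also supports fixedRate and cron triggers. A minimal sketch of the three variants (the class and method names here are illustrative only, not part of the project):

import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;

@Component
public class ScheduleExamples {

    //fixedDelay: wait 100 s after the previous run finishes (what the crawler task uses)
    @Scheduled(fixedDelay = 100 * 1000)
    public void afterPreviousRunFinishes() { }

    //fixedRate: start a run every 100 s, measured from the start of the previous run
    @Scheduled(fixedRate = 100 * 1000)
    public void atFixedRate() { }

    //cron: run every day at 03:00
    @Scheduled(cron = "0 0 3 * * *")
    public void everyNightAtThree() { }
}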

The POJO that carries the data


package org.nynu.Crawler_JD.entity;

import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;

import javax.persistence.*;
import java.util.Date;

@Entity
@Table(name = "jd_item")
@Data
@NoArgsConstructor
@AllArgsConstructor
public class Item {
    //Database primary key id
    @Id
    @GeneratedValue(strategy = GenerationType.IDENTITY)
    private Long id;
    //Product family unit, SPU (e.g. iPhone X)
    private String spu;
    //Smallest sellable unit, SKU (e.g. iPhone X, silver)
    private Long sku;
    //Product title
    private String title;
    //Product price
    private Double price;
    //File name of the downloaded product image
    private String pic;
    //Product detail page URL
    private String url;
    //Creation time
    private Date created;
    //Update time
    private Date updated;
}

The DAO layer

package org.nynu.Crawler_JD.dao;

import org.nynu.Crawler_JD.entity.Item;
import org.springframework.data.jpa.repository.JpaRepository;
import org.springframework.stereotype.Repository;

@Repository
public interface ItemDao extends JpaRepository<Item, Long> {
}
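
JpaRepository already gives us save() and findAll(Example) out of the box, which is all the service below needs. As a side note, Spring Data JPA can also derive a query from a method name; a hypothetical variant of ItemDao (not part of the original project) would let the task look an item up by sku directly:

package org.nynu.Crawler_JD.dao;

import java.util.List;

import org.nynu.Crawler_JD.entity.Item;
import org.springframework.data.jpa.repository.JpaRepository;
import org.springframework.stereotype.Repository;

//Hypothetical variant: Spring Data derives "select ... where sku = ?" from the method name,
//so the duplicate check in the task could become itemDao.findBySku(sku).isEmpty()
@Repository
public interface ItemDao extends JpaRepository<Item, Long> {
    List<Item> findBySku(Long sku);
}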

The service layer

package org.nynu.Crawler_JD.service;

import org.nynu.Crawler_JD.entity.Item;

import java.util.List;

public interface ItemService {
    //Save a product
    void save(Item item);
    //Query products matching the non-null fields of the given item
    List<Item> findAll(Item item);
}

package org.nynu.Crawler_JD.service.impl;

import org.nynu.Crawler_JD.dao.ItemDao;
import org.nynu.Crawler_JD.entity.Item;
import org.nynu.Crawler_JD.service.ItemService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.domain.Example;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;

import java.util.List;


@Service
public class ItemServiceImpl implements ItemService {

    @Autowired
    private ItemDao itemDao;

    //Save product data (runs inside a transaction)
    @Override
    @Transactional
    public void save(Item item) {
        this.itemDao.save(item);
    }

    //Query product data by example: all non-null fields of the item act as filters
    @Override
    public List<Item> findAll(Item item) {
        Example<Item> example = Example.of(item);
        return this.itemDao.findAll(example);
    }
}

Wrapping HttpClient

package org.nynu.Crawler_JD.utils;

import org.apache.commons.lang3.RandomStringUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import org.springframework.stereotype.Component;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;

@Component
public class HttpUtils {
    //Connection pool manager that manages all outgoing requests
    private PoolingHttpClientConnectionManager poolManager;

    public HttpUtils() {
        this.poolManager = new PoolingHttpClientConnectionManager();
        //Configure the pool
        //Maximum total connections
        this.poolManager.setMaxTotal(100);
        //Maximum connections per host
        this.poolManager.setDefaultMaxPerRoute(10);
    }

    //Fetch a page with a GET request
    public String doGetHtml(String url) {
        //Create an HttpClient backed by the shared pool
        HttpClient httpClient = HttpClients.custom().setConnectionManager(poolManager).build();

        //Create the GET request for the target address
        HttpGet httpGet = new HttpGet(url);
        //Configure the request
        RequestConfig config = RequestConfig.custom()
                //Maximum time to establish a connection, ms
                .setConnectTimeout(1000)
                //Maximum time to obtain a connection from the pool, ms
                .setConnectionRequestTimeout(500)
                //Maximum time for data transfer, ms
                .setSocketTimeout(10 * 1000).build();
        httpGet.setConfig(config);

        //Browser identification
        httpGet.addHeader("User-Agent", "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.6)");
        //Content type
        httpGet.addHeader("Content-Type", "application/x-www-form-urlencoded");
        HttpResponse response = null;
        String str = "";
        try {
            //Execute the request and return the page content
            response = httpClient.execute(httpGet);
            str = judgeResponse(response);
        } catch (IOException e) {
            e.printStackTrace();
        }
        return str;
    }

    //Download an image referenced by the page
    public String doGetImg(String url) {
        //Create an HttpClient backed by the shared pool
        HttpClient httpClient = HttpClients.custom().setConnectionManager(poolManager).build();

        //Create the GET request for the target address
        HttpGet httpGet = new HttpGet(url);

        //Configure the request (same settings as above)
        RequestConfig config = RequestConfig.custom()
                .setConnectTimeout(1000)
                .setConnectionRequestTimeout(500)
                .setSocketTimeout(10 * 1000).build();
        httpGet.setConfig(config);

        //Browser identification
        httpGet.addHeader("User-Agent", "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.6)");
        //Content type
        httpGet.addHeader("Content-Type", "application/x-www-form-urlencoded");
        HttpResponse response = null;
        String str = "";
        try {
            response = httpClient.execute(httpGet);
            if (response.getStatusLine().getStatusCode() == 200) {
                //Extract the image's file extension from the URL
                String suffixName = url.substring(url.lastIndexOf("."));

                //Rename the image with a random alphanumeric prefix
                String pictureName = RandomStringUtils.randomAlphanumeric(4) + suffixName;

                //Write the image to disk; try-with-resources closes the stream afterwards
                try (OutputStream picture = new FileOutputStream(
                        new File("C:\\Users\\Administrator\\Desktop\\images\\" + pictureName))) {
                    response.getEntity().writeTo(picture);
                }
                //Return the image's file name
                str = pictureName;
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return str;
    }

    //Check the response status code and extract the body
    private static String judgeResponse(HttpResponse resp) throws IOException {
        String page = "";
        //Proceed only when the status code is 200
        if (resp.getStatusLine().getStatusCode() == 200) {
            HttpEntity entity = resp.getEntity();
            //Convert the page body to a string
            page = EntityUtils.toString(entity, "utf8");
        }
        return page;
    }
}
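
Before wiring the wrapper into the scheduled task, it can be sanity-checked with a plain JUnit test (a hypothetical test of my own, not part of the original post; it needs network access, and the URL is just an example):

import static org.junit.Assert.assertFalse;

import org.junit.Test;
import org.nynu.Crawler_JD.utils.HttpUtils;

public class HttpUtilsTest {

    @Test
    public void fetchesAPage() {
        //HttpUtils has a no-arg constructor, so no Spring context is needed here
        HttpUtils httpUtils = new HttpUtils();
        String html = httpUtils.doGetHtml("https://www.jd.com/");
        //doGetHtml returns an empty string on failure, so non-empty means the request worked
        assertFalse(html.isEmpty());
    }
}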

Implementing the scraping task

package org.nynu.Crawler_JD.task;

import com.fasterxml.jackson.databind.ObjectMapper;
import lombok.SneakyThrows;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.nynu.Crawler_JD.entity.Item;
import org.nynu.Crawler_JD.service.ItemService;
import org.nynu.Crawler_JD.utils.HttpUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;

import java.util.Date;
import java.util.List;


@Component
public class ItemTask {

    @Autowired
    private HttpUtils httpUtils;
    @Autowired
    private ItemService itemService;
    //Jackson utility for parsing JSON data
    private static final ObjectMapper MAPPER = new ObjectMapper();

    //Start the next download 100 * 1000 ms after the previous one finishes
    @Scheduled(fixedDelay = 100 * 1000)
    public void regularDownloadItem() throws Exception {
        //Base search URL; JD's page parameter counts half pages, so it advances by 2
        String url = "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&suggest=1.def.0.V08--12s0%2C38s0%2C97s0&wq=%E6%89%8B&s=90&click=0&page=";
        int page = 1;
        //Walk through the result pages
        for (int i = 1; i < 201; i = i + 2) {
            //Fetch the page for the current page parameter
            String html = httpUtils.doGetHtml(url + i);
            System.out.println("Scraping page " + page);
            page++;
            //Parse the page, extract the product data, and store it
            this.parse(html);
        }
        System.out.println("All pages scraped!");
    }

    @SneakyThrows
    private void parse(String html) {
        //Parse the HTML into a Document
        Document document = Jsoup.parse(html);
        //Collect the SPU elements
        Elements elementsSpu = document.select("div#J_goodsList > ul > li");
        for (Element elementSpu : elementsSpu) {
            //Each SPU id
            String spu = elementSpu.attr("data-spu");

            //Collect the SKU elements within this SPU
            Elements elementsSku = elementSpu.select("li.ps-item");
            for (Element elementSku : elementsSku) {
                //Each SKU id
                long sku = Long.parseLong(elementSku.select("[data-sku]").attr("data-sku"));
                //Look the product up by SKU
                Item item = new Item();
                item.setSku(sku);
                List<Item> list = this.itemService.findAll(item);
                //If the product already exists, skip to the next one instead of saving it again
                if (list.size() > 0) {
                    continue;
                }
                //Set the product's SPU
                item.setSpu(spu);
                //Set the product detail page URL
                String itemUrl = "https://item.jd.com/" + sku + ".html";
                item.setUrl(itemUrl);
                //Set the product image
                String picUrl = elementSku.select("img[data-sku]").first().attr("data-lazy-img");
                //Testing showed the image URL can be blank, which breaks the download, so check it first
                if (StringUtils.isNotBlank(picUrl)) {
                    picUrl = "https:" + picUrl;
                } else {
                    //Fall back to a fixed phone image when the URL is blank
                    picUrl = "https://img11.360buyimg.com/n7/jfs/t1/91783/29/15480/108758/5e71e997E22c9831a/3c89cb50050d4b5a.jpg";
                }
                String img = this.httpUtils.doGetImg(picUrl);
                item.setPic(img);
                //Set the product price
                String pricesJSON = this.httpUtils.doGetHtml("https://p.3.cn/prices/mgets?skuIds=" + sku);
                double price = MAPPER.readTree(pricesJSON).get(0).get("p").asDouble();
                item.setPrice(price);
                //Set the product title
                String itemHtml = this.httpUtils.doGetHtml(item.getUrl());
                String title = Jsoup.parse(itemHtml).select("div.sku-name").text();
                item.setTitle(title);
                //Set the creation time
                item.setCreated(new Date());
                //Set the update time
                item.setUpdated(item.getCreated());

                //Save the product to the database
                this.itemService.save(item);
            }
        }
    }
}
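
The price comes from a separate JSON endpoint rather than from the page itself. At the time of writing, https://p.3.cn/prices/mgets?skuIds=... returned a one-element JSON array in which the field p held the current price as a string (the exact payload shape here is an assumption reconstructed from observed responses). A standalone sketch of the Jackson parsing step used above:

import com.fasterxml.jackson.databind.ObjectMapper;

public class PriceParseDemo {
    public static void main(String[] args) throws Exception {
        //Sample payload shaped like the p.3.cn response; the values are made up
        String pricesJSON = "[{\"op\":\"6099.00\",\"m\":\"6399.00\",\"id\":\"J_100004325476\",\"p\":\"5999.00\"}]";
        ObjectMapper mapper = new ObjectMapper();
        //get(0) selects the first array element; get("p") is the price node,
        //and asDouble() converts the textual value to a double
        double price = mapper.readTree(pricesJSON).get(0).get("p").asDouble();
        System.out.println(price); //prints 5999.0
    }
}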
