Java 爬虫实现

本贴最后更新于 1826 天前,其中的信息可能已经时异事殊

爬虫

在大数据时代,我们需要获取大量的数据进行数据的挖掘、分析以及筛选。比如当我们做一个项目的时候,需要大量的真实数据进行解析,这时候就需要去某些开放的网站进行数据的爬取。

  • 注:爬虫技术本身并不违反法律条文,但是对于抓取的数据,以及抓取之后的用途法律都有明确的规定和说明,请在合法的情况下使用爬虫技术。

项目介绍

定时任务爬取京东商城手机商品数据

项目前期准备

1、所需依赖

<!-- spring boot父项目 --> <parent> <groupId>org.springframework.boot</groupId> <artifactId>spring-boot-starter-parent</artifactId> <version>2.2.6.RELEASE</version> <relativePath/> <!-- lookup parent from repository --> </parent> <!-- spring mvc--> <dependency> <groupId>org.springframework.boot</groupId> <artifactId>spring-boot-starter-web</artifactId> </dependency> <!-- spring data jpa--> <dependency> <groupId>org.springframework.boot</groupId> <artifactId>spring-boot-starter-data-jpa</artifactId> </dependency> <!-- 使用数据库所需的依赖 --> <dependency> <groupId>mysql</groupId> <artifactId>mysql-connector-java</artifactId> </dependency> <!-- httpclient抓取网页信息所需依赖 --> <dependency> <groupId>org.apache.httpcomponents</groupId> <artifactId>httpclient</artifactId> </dependency> <!-- 加入jsoup依赖,用于解析页面文件 --> <dependency> <groupId>org.jsoup</groupId> <artifactId>jsoup</artifactId> <version>1.11.3</version> </dependency> <!-- 加入commons-lang3,使用其中的工具类,帮助处理字符串 --> <dependency> <groupId>org.apache.commons</groupId> <artifactId>commons-lang3</artifactId> </dependency> <!--导入配置文件处理器,配置文件进行绑定就会有提示--> <dependency> <groupId>org.springframework.boot</groupId> <artifactId>spring-boot-configuration-processor</artifactId> <optional>true</optional> </dependency> <!-- 导入lombok减少代码冗余 --> <dependency> <groupId>org.projectlombok</groupId> <artifactId>lombok</artifactId> <version>1.16.14</version> </dependency> <!--导入配置文件处理器,配置文件进行绑定就会有提示--> <dependency> <groupId>org.springframework.boot</groupId> <artifactId>spring-boot-configuration-processor</artifactId> <optional>true</optional> </dependency> <!-- junit单元测试类 --> <dependency> <groupId>junit</groupId> <artifactId>junit</artifactId> </dependency>

2、项目配置文件

#数据库 spring.datasource.driver-class-name=com.mysql.cj.jdbc.Driver spring.datasource.url=jdbc:mysql://localhost:3306/test?useUnicode=true&characterEncoding=utf8&useSSL=false&serverTimezone=Asia/Shanghai spring.datasource.username=root spring.datasource.password=123456 #jpa spring.jpa.database=MySQL spring.jpa.show-sql=true #日志处理 logging.file.path=springboot.log logging.pattern.file=%d{yyyy-MM-dd HH:mm:ss} : [%thread] : %-5level : %logger{50} : %msg%n logging.pattern.console=%d{yyyy-MM-dd HH:mm:ss} : [%thread] : %-5level : %logger{50} : %msg%n

3、数据库准备

数据库建表语句

Create Table CREATE TABLE `jd_item` ( `id` bigint NOT NULL AUTO_INCREMENT COMMENT '主键id', `spu` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT '商品集合id', `sku` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL COMMENT '商品最小品类单元id', `title` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL COMMENT '商品标题', `price` bigint DEFAULT NULL COMMENT '商品价格', `pic` varchar(200) DEFAULT NULL COMMENT '商品图片', `url` varchar(200) DEFAULT NULL COMMENT '商品详情地址', `created` datetime DEFAULT NULL COMMENT '创建时间', `updated` datetime DEFAULT NULL COMMENT '更新时间', PRIMARY KEY (`id`,`spu`), KEY `sku` (`sku`) USING BTREE ) ENGINE=InnoDB AUTO_INCREMENT=20 DEFAULT CHARSET=utf8 COMMENT='京东商品表'

代码编写

项目启动的引导类

import org.springframework.boot.SpringApplication; import org.springframework.boot.autoconfigure.SpringBootApplication; import org.springframework.scheduling.annotation.EnableScheduling; @SpringBootApplication @EnableScheduling public class Application { public static void main(String[] args) { SpringApplication.run(Application.class,args); } }

传输数据的 POJO

import lombok.AllArgsConstructor; import lombok.Data; import lombok.NoArgsConstructor; import javax.persistence.*; import java.util.Date; @Entity @Table(name = "jd_item") @Data @NoArgsConstructor @AllArgsConstructor public class Item { //主键 @Id @GeneratedValue(strategy = GenerationType.IDENTITY) //数据库主键id private Long id; //基本商品集合单元(例如:iPhone X) private String spu; //最小单位商品(例如:iPhone X 银白色) private Long sku; //商品标题 private String title; //商品价格 private Double price; //下载后的商品图片名称 private String pic; //商品详情地址 private String url; //创建时间 private Date created; //更新时间 private Date updated; }

Dao 层

import org.nynu.Crawler_JD.entity.Item; import org.springframework.data.jpa.repository.JpaRepository; import org.springframework.stereotype.Repository; @Repository public interface ItemDao extends JpaRepository<Item,Long> { }

Service 层

import org.nynu.Crawler_JD.entity.Item; import java.util.List; public interface ItemService { public void Save (Item item); public List<Item> FindAll(Item item); }
import org.nynu.Crawler_JD.dao.ItemDao; import org.nynu.Crawler_JD.entity.Item; import org.nynu.Crawler_JD.service.ItemService; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.data.annotation.Transient; import org.springframework.data.domain.Example; import org.springframework.stereotype.Service; import org.springframework.transaction.annotation.Transactional; import java.util.List; @Service public class ItemServiceImpl implements ItemService { @Autowired private ItemDao itemDao; //保存商品信息 @Override //添加事务功能 @Transactional public void Save(Item item) { this.itemDao.save(item); } //查询商品信息 @Override public List<Item> FindAll(Item item) { Example<Item> example = Example.of(item); List<Item> list = this.itemDao.findAll(example); return list; } }

封装 Http Client

package org.nynu.Crawler_JD.utils; import org.apache.commons.lang3.RandomStringUtils; import org.apache.http.HttpEntity; import org.apache.http.HttpResponse; import org.apache.http.client.HttpClient; import org.apache.http.client.config.RequestConfig; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.HttpClients; import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; import org.apache.http.util.EntityUtils; import org.springframework.stereotype.Component; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; @Component public class HttpUtils { //创建一个连接池管理对象,管理请求 private PoolingHttpClientConnectionManager poolManager; public HttpUtils() { this.poolManager = new PoolingHttpClientConnectionManager(); //配置连接池管理对象 //设置最大连接数 this.poolManager.setMaxTotal(100); //设置每个主机的最大连接数 this.poolManager.setDefaultMaxPerRoute(10); } //通过Get请求请求页面 public String doGetHtml(String url){ //创建HttpClient对象 HttpClient httpClient = HttpClients.custom().setConnectionManager(poolManager).build(); //设置请求和请求地址 HttpGet httpGet = new HttpGet(url); //配置请求信息 RequestConfig config = RequestConfig.custom().setConnectTimeout(1000)//创建连接的最长时间 ms //获取连接的最长时间 ms .setConnectionRequestTimeout(500) //数据传输的最长时间 ms .setSocketTimeout(10 * 1000).build(); //设置请求信息 httpGet.setConfig(config); // 浏览器表示 httpGet.addHeader("User-Agent", "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.6)"); // 传输的类型 httpGet.addHeader("Content-Type", "application/x-www-form-urlencoded"); HttpResponse response = null; String str = ""; try { //获取请求结果并返回页面数据 response = httpClient.execute(httpGet); str = JudgeResponse(response); } catch (IOException e) { e.printStackTrace(); } return str; } //下载页面的图片 public String doGetImg(String url){ //创建HttpClient对象 HttpClient httpClient = HttpClients.custom().setConnectionManager(poolManager).build(); //设置请求和请求地址 HttpGet httpGet = new HttpGet(url); //配置请求信息 RequestConfig config = 
RequestConfig.custom().setConnectTimeout(1000)//创建连接的最长时间 ms //获取连接的最长时间 ms .setConnectionRequestTimeout(500) //数据传输的最长时间 ms .setSocketTimeout(10 * 1000).build(); //设置请求信息 httpGet.setConfig(config); // 浏览器表示 httpGet.addHeader("User-Agent", "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.6)"); // 传输的类型 httpGet.addHeader("Content-Type", "application/x-www-form-urlencoded"); HttpResponse response = null; String str = ""; try { //获取请求结果并返回页面数据 response = httpClient.execute(httpGet); if (response.getStatusLine().getStatusCode() == 200){ HttpEntity entity = response.getEntity(); //下载图片 //获取图片后缀 String suffixName = url.substring(url.lastIndexOf(".")); //重命名图片 String pictureName = RandomStringUtils.randomAlphanumeric(4)+suffixName; //声明输出流 OutputStream picture = new FileOutputStream(new File("C:\\Users\\Administrator\\Desktop\\images\\"+pictureName)); //下载图片 response.getEntity().writeTo(picture); //返回图片名称 str = pictureName; } } catch (IOException e) { e.printStackTrace(); } return str; } //判断请求状态码 private static String JudgeResponse (HttpResponse resp) throws IOException { String page =""; //判断得到请求状态码是否正确 if (resp.getStatusLine().getStatusCode() == 200){ HttpEntity entity = resp.getEntity(); //将页面转换成字符串输出 page = EntityUtils.toString(entity, "utf8"); } return page; } }

实现数据抓取

package org.nynu.Crawler_JD.task; import com.fasterxml.jackson.databind.ObjectMapper; import lombok.SneakyThrows; import org.apache.commons.lang3.StringUtils; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.nynu.Crawler_JD.entity.Item; import org.nynu.Crawler_JD.service.ItemService; import org.nynu.Crawler_JD.utils.HttpUtils; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.scheduling.annotation.Scheduled; import org.springframework.stereotype.Component; import java.util.Date; import java.util.List; @Component public class ItemTask { @Autowired private HttpUtils httpUtils; @Autowired private ItemService itemService; //解析JSON数据的工具类 private static final ObjectMapper MAPPER = new ObjectMapper(); //当下载任务完成后间隔多少时间进行下一次任务 单位:MS @Scheduled(fixedDelay = 100 * 1000) public void RegularDownloadItem() throws Exception { //申明需要解析的初始地址 String url = "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&suggest=1.def.0.V08--12s0%2C38s0%2C97s0&wq=%E6%89%8B&s=90&click=0&page="; int page = 1; //遍历循环进行页面跳转 for (int i = 1; i < 201; i = i + 2) { //通过请求获取页面 url += i; String html = httpUtils.doGetHtml(url ); System.out.println("正在抓取的页面为" + page); page++; //解析页面,获取商品数据并存储 this.Parsing(html); //初始化url地址 url = "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&suggest=1.def.0.V08--12s0%2C38s0%2C97s0&wq=%E6%89%8B&s=90&click=0&page="; } System.out.println("页面壁纸抓取完毕!"); } @SneakyThrows private void Parsing(String html) { //解析HTML页面,获取Document对象 Document document = Jsoup.parse(html); //获取spu集合 Elements elementsSpu = document.select("div#J_goodsList > ul >li"); for (Element elementSpu : elementsSpu) { //获取到每一个spu String spu = elementSpu.attr("data-spu"); //获取sku集合 Elements elementsSku = elementSpu.select("li.ps-item"); for (Element elementSku : elementsSku) { //获取每一个sku long sku = Long.parseLong(elementSku.select("[data-sku]").attr("data-sku")); //根据sku查询商品数据 
Item item = new Item(); item.setSku(sku); List<Item> list = this.itemService.FindAll(item); //如果商品存在,即进行下一次循环,不再保存该商品 if (list.size() > 0) { continue; } //设置商品的spu item.setSpu(spu); //设置商品的详情页链接 String itemUrl = "https://item.jd.com/" + sku + ".html"; item.setUrl(itemUrl); //设置商品图片 String picUrl = elementSku.select("img[data-sku]").first().attr("data-lazy-img"); //经过测试发现,图片的地址有可能为空导致程序出错故对其进行判断 if (picUrl != null && picUrl != "" && picUrl != " ") { picUrl = "https:" + picUrl; } else { //如果为空,让程序抓取固定的手机图片 picUrl = "https://img11.360buyimg.com/n7/jfs/t1/91783/29/15480/108758/5e71e997E22c9831a/3c89cb50050d4b5a.jpg"; } String img = this.httpUtils.doGetImg(picUrl); item.setPic(img); //设置商品价格 String pricesJSON = this.httpUtils.doGetHtml("https://p.3.cn/prices/mgets?skuIds=" + sku); double Price = MAPPER.readTree(pricesJSON).get(0).get("p").asDouble(); item.setPrice(Price); //设置商品名称 String itemHtml = this.httpUtils.doGetHtml(item.getUrl()); String title = Jsoup.parse(itemHtml).select("div.sku-name").text(); item.setTitle(title); //设置商品创建时间 item.setCreated(new Date()); //设置商品更新时间 item.setUpdated(item.getCreated()); //保存商品信息到数据库之中 this.itemService.Save(item); } } } }
  • 爬虫

    网络爬虫(Spider、Crawler),是一种按照一定的规则,自动地抓取万维网信息的程序。

    106 引用 • 275 回帖

相关帖子

欢迎来到这里!

我们正在构建一个小众社区,大家在这里相互信任,以平等 • 自由 • 奔放的价值观进行分享交流。最终,希望大家能够找到与自己志同道合的伙伴,共同成长。

注册 关于
请输入回帖内容 ...