爬虫
在大数据时代,我们需要获取大量的数据进行挖掘、分析以及筛选。比如当我们做一个项目的时候,需要大量的真实数据进行解析,这时候就需要去某些开放的网站进行数据的爬取。
- 注:爬虫技术本身并不违反法律条文,但是对于抓取的数据,以及抓取之后的用途法律都有明确的规定和说明,请在合法的情况下使用爬虫技术。
项目介绍
定时任务爬取京东商城手机商品数据
项目前期准备
1、所需依赖
<!-- Spring Boot parent POM: manages dependency versions -->
<parent>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>2.2.6.RELEASE</version>
<relativePath/> <!-- lookup parent from repository -->
</parent>
<!-- Spring MVC -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<!-- Spring Data JPA -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-data-jpa</artifactId>
</dependency>
<!-- MySQL JDBC driver -->
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
</dependency>
<!-- HttpClient: fetches the pages to crawl -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
</dependency>
<!-- Jsoup: parses the fetched HTML -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.3</version>
</dependency>
<!-- commons-lang3: string utility helpers -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
</dependency>
<!-- Configuration processor: IDE completion for @ConfigurationProperties.
     FIX: this dependency was declared twice; the duplicate has been removed. -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-configuration-processor</artifactId>
<optional>true</optional>
</dependency>
<!-- Lombok: reduces boilerplate (getters/setters/constructors) -->
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.16.14</version>
</dependency>
<!-- JUnit, limited to the test classpath -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<scope>test</scope>
</dependency>
2、项目配置文件
# Datasource
spring.datasource.driver-class-name=com.mysql.cj.jdbc.Driver
spring.datasource.url=jdbc:mysql://localhost:3306/test?useUnicode=true&characterEncoding=utf8&useSSL=false&serverTimezone=Asia/Shanghai
spring.datasource.username=root
spring.datasource.password=123456
# JPA
spring.jpa.database=MySQL
spring.jpa.show-sql=true
# Logging
# FIX: logging.file.path expects a DIRECTORY for the default spring.log file;
# a custom log file name must be set via logging.file.name (Spring Boot 2.2+).
logging.file.name=springboot.log
logging.pattern.file=%d{yyyy-MM-dd HH:mm:ss} : [%thread] : %-5level : %logger{50} : %msg%n
logging.pattern.console=%d{yyyy-MM-dd HH:mm:ss} : [%thread] : %-5level : %logger{50} : %msg%n
3、数据库准备
数据库建表语句
Create Table
-- Table holding crawled JD phone items.
-- FIX: `price` was BIGINT while the Java entity stores a Double (JD prices
-- include cents); DECIMAL(10,2) preserves the fractional part instead of
-- silently truncating it on insert.
-- NOTE(review): `sku` is VARCHAR here but Long in the Java entity — MySQL's
-- implicit conversion makes this work, but the types should be aligned.
CREATE TABLE `jd_item` (
`id` bigint NOT NULL AUTO_INCREMENT COMMENT '主键id',
`spu` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT '商品集合id',
`sku` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL COMMENT '商品最小品类单元id',
`title` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL COMMENT '商品标题',
`price` decimal(10,2) DEFAULT NULL COMMENT '商品价格',
`pic` varchar(200) DEFAULT NULL COMMENT '商品图片',
`url` varchar(200) DEFAULT NULL COMMENT '商品详情地址',
`created` datetime DEFAULT NULL COMMENT '创建时间',
`updated` datetime DEFAULT NULL COMMENT '更新时间',
PRIMARY KEY (`id`,`spu`),
KEY `sku` (`sku`) USING BTREE
) ENGINE=InnoDB AUTO_INCREMENT=20 DEFAULT CHARSET=utf8 COMMENT='京东商品表'
代码编写
项目启动的引导类
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.scheduling.annotation.EnableScheduling;
/**
 * Application entry point: boots the Spring context.
 * {@code @EnableScheduling} is required so the {@code @Scheduled}
 * crawler job in ItemTask actually runs.
 */
@SpringBootApplication
@EnableScheduling
public class Application {

    public static void main(String[] args) {
        SpringApplication.run(Application.class, args);
    }
}
传输数据的 POJO
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import javax.persistence.*;
import java.util.Date;
// JPA entity mapped to table `jd_item`; Lombok generates the accessors,
// so renaming any field would change the public getter/setter names.
@Entity
@Table(name = "jd_item")
@Data
@NoArgsConstructor
@AllArgsConstructor
public class Item {
//Database primary key, auto-generated by MySQL (AUTO_INCREMENT)
@Id
@GeneratedValue(strategy = GenerationType.IDENTITY)
//Primary key id
private Long id;
//Product family id (e.g. "iPhone X")
private String spu;
//Smallest sellable unit id (e.g. "iPhone X, silver")
//NOTE(review): Long here, but the DDL declares `sku` as VARCHAR — confirm
private Long sku;
//Product title scraped from the detail page
private String title;
//Product price
//NOTE(review): Double here, but the DDL declares `price` as BIGINT,
//which would truncate cents — confirm the column type
private Double price;
//File name of the downloaded product image
private String pic;
//Product detail page URL
private String url;
//Row creation time
private Date created;
//Row last-update time
private Date updated;
}
Dao 层
import org.nynu.Crawler_JD.entity.Item;
import org.springframework.data.jpa.repository.JpaRepository;
import org.springframework.stereotype.Repository;
// Spring Data JPA repository for Item rows (table `jd_item`).
// All CRUD and query-by-example operations are inherited from JpaRepository;
// no custom queries are needed.
@Repository
public interface ItemDao extends JpaRepository<Item,Long> {
}
Service 层
import org.nynu.Crawler_JD.entity.Item;
import java.util.List;
// Service contract for persisting and querying crawled items.
// NOTE(review): Save/FindAll violate Java's lowerCamelCase method naming,
// but renaming would break ItemServiceImpl and ItemTask.
public interface ItemService {
// Persists a single item.
public void Save (Item item);
// Returns all items whose columns match the non-null fields of the
// given probe (query-by-example).
public List<Item> FindAll(Item item);
}
import org.nynu.Crawler_JD.dao.ItemDao;
import org.nynu.Crawler_JD.entity.Item;
import org.nynu.Crawler_JD.service.ItemService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.annotation.Transient;
import org.springframework.data.domain.Example;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import java.util.List;
/**
 * Default ItemService backed by the Spring Data JPA repository.
 */
@Service
public class ItemServiceImpl implements ItemService {

    @Autowired
    private ItemDao itemDao;

    /** Persists a single item inside a transaction. */
    @Override
    @Transactional
    public void Save(Item item) {
        itemDao.save(item);
    }

    /**
     * Finds every item matching the non-null fields of the probe,
     * using Spring Data's query-by-example support.
     */
    @Override
    public List<Item> FindAll(Item item) {
        return itemDao.findAll(Example.of(item));
    }
}
封装 Http Client
package org.nynu.Crawler_JD.utils;
import org.apache.commons.lang3.RandomStringUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import org.springframework.stereotype.Component;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
/**
 * Thin wrapper around Apache HttpClient with a shared connection pool.
 * Provides page fetching (doGetHtml) and image downloading (doGetImg).
 */
@Component
public class HttpUtils {

    // Pool shared by every request issued through this bean.
    private PoolingHttpClientConnectionManager poolManager;

    public HttpUtils() {
        this.poolManager = new PoolingHttpClientConnectionManager();
        // Cap on the total number of pooled connections.
        this.poolManager.setMaxTotal(100);
        // Cap on concurrent connections per target host.
        this.poolManager.setDefaultMaxPerRoute(10);
    }

    /**
     * Fetches a page via HTTP GET.
     *
     * @param url absolute URL to fetch
     * @return the response body as a string, or "" on a non-200 status
     *         or I/O failure
     */
    public String doGetHtml(String url) {
        HttpClient httpClient = HttpClients.custom().setConnectionManager(poolManager).build();
        HttpGet httpGet = buildGet(url);
        String page = "";
        try {
            HttpResponse response = httpClient.execute(httpGet);
            page = judgeResponse(response);
        } catch (IOException e) {
            e.printStackTrace();
        }
        return page;
    }

    /**
     * Downloads an image via HTTP GET and saves it under a random
     * 4-character name, keeping the original file extension.
     *
     * @param url absolute image URL (must contain a '.' extension)
     * @return the generated file name, or "" on a non-200 status
     *         or I/O failure
     */
    public String doGetImg(String url) {
        HttpClient httpClient = HttpClients.custom().setConnectionManager(poolManager).build();
        HttpGet httpGet = buildGet(url);
        String pictureName = "";
        try {
            HttpResponse response = httpClient.execute(httpGet);
            if (response.getStatusLine().getStatusCode() == 200) {
                // Keep the original extension, randomize the base name.
                String suffixName = url.substring(url.lastIndexOf("."));
                String name = RandomStringUtils.randomAlphanumeric(4) + suffixName;
                // FIX: the original never closed this stream, leaking a file
                // handle on every download; try-with-resources closes it.
                try (OutputStream picture = new FileOutputStream(
                        new File("C:\\Users\\Administrator\\Desktop\\images\\" + name))) {
                    response.getEntity().writeTo(picture);
                }
                pictureName = name;
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return pictureName;
    }

    // Builds a GET request with the shared timeout and header configuration
    // (this setup was previously duplicated in doGetHtml and doGetImg).
    private static HttpGet buildGet(String url) {
        RequestConfig config = RequestConfig.custom()
                // Max time to establish the TCP connection (ms).
                .setConnectTimeout(1000)
                // Max time to obtain a connection from the pool (ms).
                .setConnectionRequestTimeout(500)
                // Max idle time between data packets (ms).
                .setSocketTimeout(10 * 1000)
                .build();
        HttpGet httpGet = new HttpGet(url);
        httpGet.setConfig(config);
        // Browser-like identity to avoid trivial bot blocking.
        httpGet.addHeader("User-Agent", "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.6)");
        httpGet.addHeader("Content-Type", "application/x-www-form-urlencoded");
        return httpGet;
    }

    // Returns the body of a 200 response decoded as UTF-8, "" otherwise.
    private static String judgeResponse(HttpResponse resp) throws IOException {
        String page = "";
        if (resp.getStatusLine().getStatusCode() == 200) {
            HttpEntity entity = resp.getEntity();
            page = EntityUtils.toString(entity, "utf8");
        }
        return page;
    }
}
实现数据抓取
package org.nynu.Crawler_JD.task;
import com.fasterxml.jackson.databind.ObjectMapper;
import lombok.SneakyThrows;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.nynu.Crawler_JD.entity.Item;
import org.nynu.Crawler_JD.service.ItemService;
import org.nynu.Crawler_JD.utils.HttpUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import java.util.Date;
import java.util.List;
/**
 * Scheduled crawler: pages through JD's phone search results, parses each
 * result page and stores new items via ItemService.
 */
@Component
public class ItemTask {

    // Search-result page template; the page index is appended at the end.
    // JD uses odd page indexes (1, 3, 5, ...) for full result pages.
    private static final String SEARCH_URL =
            "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&suggest=1.def.0.V08--12s0%2C38s0%2C97s0&wq=%E6%89%8B&s=90&click=0&page=";

    @Autowired
    private HttpUtils httpUtils;

    @Autowired
    private ItemService itemService;

    // Shared JSON parser; ObjectMapper is thread-safe and costly to create.
    private static final ObjectMapper MAPPER = new ObjectMapper();

    /**
     * Crawls up to 100 result pages. fixedDelay waits 100 s after each run
     * completes before starting the next one.
     */
    @Scheduled(fixedDelay = 100 * 1000)
    public void RegularDownloadItem() throws Exception {
        int page = 1;
        for (int i = 1; i < 201; i += 2) {
            // Build the URL fresh each iteration; the original appended to a
            // shared variable and reset it at the bottom of the loop, which
            // was error-prone.
            String html = httpUtils.doGetHtml(SEARCH_URL + i);
            System.out.println("正在抓取的页面为" + page);
            page++;
            this.parsePage(html);
        }
        // FIX: the original message said "壁纸" (wallpaper), a copy-paste
        // leftover from another crawler; this project crawls product pages.
        System.out.println("商品页面抓取完毕!");
    }

    /**
     * Parses one search-result page: extracts every sku, skips those already
     * stored, downloads image/price/title for the rest and persists them.
     */
    @SneakyThrows
    private void parsePage(String html) {
        Document document = Jsoup.parse(html);
        // Each <li> under the goods list is one spu (product family).
        Elements elementsSpu = document.select("div#J_goodsList > ul > li");
        for (Element elementSpu : elementsSpu) {
            String spu = elementSpu.attr("data-spu");
            // Each ps-item is one concrete sku (e.g. a color variant).
            Elements elementsSku = elementSpu.select("li.ps-item");
            for (Element elementSku : elementsSku) {
                long sku = Long.parseLong(elementSku.select("[data-sku]").attr("data-sku"));
                // Skip skus that are already in the database.
                Item item = new Item();
                item.setSku(sku);
                if (!this.itemService.FindAll(item).isEmpty()) {
                    continue;
                }
                item.setSpu(spu);
                item.setUrl("https://item.jd.com/" + sku + ".html");
                // The lazy-load image attribute may be missing or blank.
                Element imgEl = elementSku.select("img[data-sku]").first();
                String picUrl = (imgEl == null) ? null : imgEl.attr("data-lazy-img");
                // FIX: the original compared strings with != (reference
                // identity, always true for non-interned values); use a
                // proper blank check instead.
                if (StringUtils.isNotBlank(picUrl)) {
                    picUrl = "https:" + picUrl;
                } else {
                    // Fallback image when the page carries no usable URL.
                    picUrl = "https://img11.360buyimg.com/n7/jfs/t1/91783/29/15480/108758/5e71e997E22c9831a/3c89cb50050d4b5a.jpg";
                }
                item.setPic(this.httpUtils.doGetImg(picUrl));
                // Price comes from JD's price API as a JSON array: [{"p": ...}].
                String pricesJSON = this.httpUtils.doGetHtml("https://p.3.cn/prices/mgets?skuIds=" + sku);
                item.setPrice(MAPPER.readTree(pricesJSON).get(0).get("p").asDouble());
                // Title is read from the item detail page.
                String itemHtml = this.httpUtils.doGetHtml(item.getUrl());
                item.setTitle(Jsoup.parse(itemHtml).select("div.sku-name").text());
                item.setCreated(new Date());
                item.setUpdated(item.getCreated());
                this.itemService.Save(item);
            }
        }
    }
}
欢迎来到这里!
我们正在构建一个小众社区,大家在这里相互信任,以平等 • 自由 • 奔放的价值观进行分享交流。最终,希望大家能够找到与自己志同道合的伙伴,共同成长。
注册 关于