Java 版的爬虫 爬取 CSDN 搜索韩顺平的文章
这个是爬虫爬取下来的结果,结果保存为 txt,也可以使用 POI 生成 Excel 文件,具体我就不讲解了
万物皆可爬(Java 万能的)
本文项目使用 Spring Boot 搭建 普通 Maven 项目亦可!!!
先看一下结果吧(不爬取广告栏):
这个是 CSDN 官网的数据:
主要使用的是 jsoup
进行爬虫,英语比较好的话建议还是看下官方的文档 = =
这个是项目搭建结构
下载工具,包含各种类型的请求 Client.java
package com.niu.req.downloader;

import org.apache.http.NameValuePair;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.*;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import org.springframework.util.StringUtils;

import java.io.UnsupportedEncodingException;
import java.util.*;

/**
 * Download helper that executes a {@link Request} over HTTP and returns the
 * outcome as a {@link Page}.
 */
public class Client {

    /** Charset name used to encode form parameters and to decode response bodies. */
    private static final String ENCODING = "UTF-8";

    /** Connection timeout, in milliseconds. */
    private static final int CONNECT_TIMEOUT = 60000;

    /** Socket timeout (maximum wait for response data), in milliseconds. */
    private static final int SOCKET_TIMEOUT = 60000;

    /**
     * Executes the given request and wraps the response in a {@link Page}.
     *
     * <p>A new HTTP client is created for the call and closed before returning.
     * Form parameters are only sent for POST requests. The status code is copied
     * to the page whenever the response carries a status line, including for
     * responses without a body; the raw content is only set when a response
     * entity is present.
     *
     * @param request the request to execute; a {@code null} method is treated as GET
     * @return the page built from the request, populated with the response status and body
     * @throws Exception if building the request or executing it fails
     */
    public static Page doRequest(Request request) throws Exception {
        // Anything that is not explicitly POST (including a null method) is sent as GET.
        final HttpRequestBase http;
        if (request.getMethod() == Constant.Method.POST) {
            http = new HttpPost(request.getUrl());
        } else {
            http = new HttpGet(request.getUrl());
        }

        RequestConfig requestConfig = RequestConfig.custom()
                .setConnectTimeout(CONNECT_TIMEOUT)
                .setSocketTimeout(SOCKET_TIMEOUT)
                .build();
        http.setConfig(requestConfig);

        if (request.getHeaders() != null && !request.getHeaders().isEmpty()) {
            packageHeader(request.getHeaders(), http);
        }
        if (http instanceof HttpPost && request.getParams() != null) {
            packageParam(request.getParams(), (HttpPost) http);
        }

        Page page = Page.build(request);

        // Execute the request; both the client and the response are closed automatically.
        try (final CloseableHttpClient httpClient = HttpClients.custom().build();
             final CloseableHttpResponse httpResponse = httpClient.execute(http)) {
            if (httpResponse != null && httpResponse.getStatusLine() != null) {
                // Record the status even when there is no body, so the page does not
                // keep its placeholder status for body-less responses.
                page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
                if (httpResponse.getEntity() != null) {
                    page.setRaw(EntityUtils.toString(httpResponse.getEntity(), ENCODING));
                }
            }
        }
        return page;
    }

    /**
     * Copies every entry of {@code params} onto the request as an HTTP header.
     *
     * @param params     header names mapped to header values; ignored when null or empty
     * @param httpMethod the request to add the headers to
     */
    private static void packageHeader(final Map<String, String> params, final HttpRequestBase httpMethod) {
        if (params == null || params.isEmpty()) {
            return;
        }
        params.forEach(httpMethod::setHeader);
    }

    /**
     * Attaches {@code params} to the request as a URL-encoded form body.
     *
     * @param params     form field names mapped to values; ignored when null or empty
     * @param httpMethod the entity-enclosing request that receives the form body
     * @throws UnsupportedEncodingException if the configured encoding is not supported
     */
    private static void packageParam(final Map<String, String> params,
                                     final HttpEntityEnclosingRequestBase httpMethod)
            throws UnsupportedEncodingException {
        if (params == null || params.isEmpty()) {
            return;
        }
        final List<NameValuePair> nvps = new ArrayList<>(params.size());
        for (Map.Entry<String, String> entry : params.entrySet()) {
            nvps.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
        }
        httpMethod.setEntity(new UrlEncodedFormEntity(nvps, ENCODING));
    }
}
Constant.java
Http 请求的一些常用参数
package com.niu.req.downloader; import lombok.AllArgsConstructor; import lombok.Getter; /** * @Description http 请求的一些常用参数 */ public class Constant { @Getter @AllArgsConstructor public enum Method { GET("GET"),POST("POST"); String code; } @Getter @AllArgsConstructor public enum StatusCode { CODE_200(200),CODE_404(404),CODE_503(503),CODE_500(500); Integer code; } @Getter @AllArgsConstructor public enum Header { REFERER("Referer"),USER_AGENT("User-Agent"); String code; } }
Page.java
页面实体
package com.niu.req.downloader;

import lombok.Getter;
import lombok.Setter;

/**
 * Result of a download: the originating request, the HTTP status code and the
 * raw response text. Instances are created through {@link #build(Request)}.
 */
@Getter
@Setter
public class Page {

    /** The request this page was created for. */
    private Request request;

    /** HTTP status code; starts at 500 until a different value is set. */
    private Integer statusCode = Constant.StatusCode.CODE_500.getCode();

    /** Raw response content; {@code null} until set. */
    private String raw;

    /**
     * Creates a page bound to the given request, leaving the status code and
     * raw content at their initial values.
     *
     * @param request the request the page belongs to
     * @return a new page referencing {@code request}
     */
    public static Page build(Request request) {
        final Page created = new Page();
        created.request = request;
        return created;
    }

    /** Private so that pages are only created via {@link #build(Request)}. */
    private Page() {
    }
}
Request.java
封装请求实体类
package com.niu.req.downloader;

import lombok.Getter;
import lombok.Setter;

import java.util.Map;

/**
 * Describes an HTTP request: target URL, method, optional headers and
 * optional parameters.
 */
@Getter
@Setter
public class Request {

    /** Target URL of the request. */
    private String url;

    /** HTTP method; GET unless changed through the setter. */
    private Constant.Method method = Constant.Method.GET;

    /** Header names mapped to values; may be {@code null}. */
    private Map<String, String> headers;

    /** Parameter names mapped to values; may be {@code null}. */
    private Map<String, String> params;

    /**
     * Creates a request for the given URL with the default method.
     *
     * @param url the URL to request
     */
    public Request(String url) {
        this.url = url;
    }
}
RequestTest.java
测试类
package com.niu.req.course01; import com.niu.req.downloader.Client; import com.niu.req.downloader.Page; import com.niu.req.downloader.Request; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; /** * @Description 主要爬CSDN 搜索韩顺平后页面的所有文章 */ public class RequestTest { public static void main(String[] args) throws Exception { Request request = new Request("https://so.csdn.net/so/search/s.do?q=%E9%9F%A9%E9%A1%BA%E5%B9%B3&t=%20&u="); // url 路径 Page page = Client.doRequest(request); Document parse = Jsoup.parse(page.getRaw()); //得到Html文本 Elements select = parse.select("div.search-list-con"); //获取要得到的节点 select.forEach( e -> { Elements data = e.select("dl.search-list.J_search"); // 再找到节点下的节点 if (data == null || data.isEmpty()) { return; } for (Element datum : data) { System.out.println(datum.text());// 得到text } }); } }
搭建结束,运行 RequestTest.java 即可
欢迎来到这里!
我们正在构建一个小众社区,大家在这里相互信任,以平等 • 自由 • 奔放的价值观进行分享交流。最终,希望大家能够找到与自己志同道合的伙伴,共同成长。
注册 关于