京东手机数据爬取案例
学习了 HttpClient 和 Jsoup,就掌握了如何抓取数据和如何解析数据。接下来,我们做一个小练习,把京东的手机数据抓取下来。
主要目的是 HttpClient和 Jsoup 的学习。
1. 需求分析
首先访问京东,搜索手机,分析页面,我们抓取以下商品数据:商品图片、价格、标题、商品详情页。
1.1 SPU和 SKU
除了以上四个属性以外,我们发现上图中的苹果手机有四种产品,我们应该每一种都要抓取。那么这里就必须要了解 SPU 和 SKU 的概念。
SPU = Standard Product Unit(标准产品单位)
SPU是商品信息聚合的最小单位,是一组可复用、易检索的标准化信息的集合,该集合描述了一个产品的特性。通俗点讲,属性值、特性相同的商品就可以称为一个SPU。
例如上图中的苹果手机就是SPU,包括红色、深灰色、金色、银色。
SKU=stock keeping unit(库存量单位)
SKU即库存进出计量的单位,可以是以件、盒、托盘等为单位。SKU是物理上不可分割的最小存货单元。在使用时要根据不同业态,不同管理模式来处理。在服装、鞋类商品中使用最多最普遍。
例如上图中的苹果手机有几个款式,红色苹果手机,就是一个SKU。
2. 开发准备
2.1 数据库表分析
1 2 3 4 5 6 7 8 9 10 11 12 13 14
| -- Table holding one row per crawled JD product variant (SKU).
CREATE TABLE `jd_item` (
  `id` bigint(10) NOT NULL AUTO_INCREMENT COMMENT '主键id',
  `spu` bigint(15) DEFAULT NULL COMMENT '商品集合id',
  `sku` bigint(15) DEFAULT NULL COMMENT '商品最小品类单元id',
  `title` varchar(100) DEFAULT NULL COMMENT '商品标题',
  -- DECIMAL instead of BIGINT: the crawler stores the price as a fractional
  -- number, and an integer column would round away the part after the point.
  `price` decimal(10,2) DEFAULT NULL COMMENT '商品价格',
  `pic` varchar(200) DEFAULT NULL COMMENT '商品图片',
  `url` varchar(200) DEFAULT NULL COMMENT '商品详情地址',
  `created` datetime DEFAULT NULL COMMENT '创建时间',
  `updated` datetime DEFAULT NULL COMMENT '更新时间',
  PRIMARY KEY (`id`),
  -- The crawler looks rows up by sku to skip already-stored items.
  KEY `sku` (`sku`) USING BTREE
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8 COMMENT='京东商品表';
|
2.2 添加依赖
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
| <?xml version="1.0" encoding="UTF-8"?> <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <modelVersion>4.0.0</modelVersion> <parent> <groupId>org.springframework.boot</groupId> <artifactId>spring-boot-starter-parent</artifactId> <version>2.0.2.RELEASE</version> </parent>
<!-- Coordinates of the crawler project itself; it inherits from the Spring Boot starter parent declared above. -->
<groupId>cn.itbuild</groupId> <artifactId>itbuild-crawler-jd</artifactId> <version>1.0-SNAPSHOT</version>
<!-- Spring Boot web starter. No version is declared here; presumably it is inherited from the parent POM. -->
<dependencies> <dependency> <groupId>org.springframework.boot</groupId> <artifactId>spring-boot-starter-web</artifactId> </dependency>
<!-- Spring Data JPA starter, used by the repository layer. -->
<dependency> <groupId>org.springframework.boot</groupId> <artifactId>spring-boot-starter-data-jpa</artifactId> </dependency>
<!-- MySQL JDBC driver, pinned explicitly to 8.0.11. -->
<dependency> <groupId>mysql</groupId> <artifactId>mysql-connector-java</artifactId> <version>8.0.11</version> </dependency>
<!-- Apache HttpClient, used to download pages and images. No version is declared here. -->
<dependency> <groupId>org.apache.httpcomponents</groupId> <artifactId>httpclient</artifactId> </dependency>
<!-- Jsoup HTML parser, pinned explicitly to 1.10.3. -->
<dependency> <groupId>org.jsoup</groupId> <artifactId>jsoup</artifactId> <version>1.10.3</version> </dependency>
<!-- Apache Commons Lang 3 (string helpers). No version is declared here. -->
<dependency> <groupId>org.apache.commons</groupId> <artifactId>commons-lang3</artifactId> </dependency> </dependencies> </project>
|
2.3 添加配置文件
application.properties
1 2 3 4 5 6 7 8 9
| # DB configuration
spring.datasource.driver-class-name=com.mysql.cj.jdbc.Driver
# NOTE(review): the URL in the original text was truncated to "jdbc:mysql:".
# Host, port and database name below are assumed values - replace them with
# the coordinates of the database that contains the jd_item table.
spring.datasource.url=jdbc:mysql://127.0.0.1:3306/crawler
spring.datasource.username=root
spring.datasource.password=root

# JPA
spring.jpa.database=MYSQL
spring.jpa.show-sql=true
|
3. 代码实现
3.1 编写pojo
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107
| package cn.itbuild.jd.pojo;
import javax.persistence.*; import java.util.Date;
/**
 * JPA entity mapped to the {@code jd_item} table.
 *
 * <p>Plain mutable bean: every column is exposed through a getter/setter pair
 * and no validation is performed.
 */
@Entity
@Table(name = "jd_item")
public class Item {

    /** Primary key, generated with the IDENTITY strategy. */
    @Id
    @GeneratedValue(strategy = GenerationType.IDENTITY)
    private Long id;

    /** Id of the product group (SPU) this item belongs to. */
    private Long spu;

    /** Id of the concrete product variant (SKU). */
    private Long sku;

    /** Product title. */
    private String title;

    /** Product price. */
    private Double price;

    /** Product picture value as stored by the caller. */
    private String pic;

    /** Address of the product detail page. */
    private String url;

    /** Creation timestamp. */
    private Date created;

    /** Last-update timestamp. */
    private Date updated;

    // ---- id ----

    public Long getId() {
        return id;
    }

    public void setId(Long id) {
        this.id = id;
    }

    // ---- spu / sku ----

    public Long getSpu() {
        return spu;
    }

    public void setSpu(Long spu) {
        this.spu = spu;
    }

    public Long getSku() {
        return sku;
    }

    public void setSku(Long sku) {
        this.sku = sku;
    }

    // ---- descriptive columns ----

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public Double getPrice() {
        return price;
    }

    public void setPrice(Double price) {
        this.price = price;
    }

    public String getPic() {
        return pic;
    }

    public void setPic(String pic) {
        this.pic = pic;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    // ---- audit timestamps ----

    public Date getCreated() {
        return created;
    }

    public void setCreated(Date created) {
        this.created = created;
    }

    public Date getUpdated() {
        return updated;
    }

    public void setUpdated(Date updated) {
        this.updated = updated;
    }
}
|
3.2 编写dao
1 2 3 4 5 6 7 8 9 10
| package cn.itbuild.jd.dao;
import cn.itbuild.jd.pojo.Item; import org.springframework.data.jpa.repository.JpaRepository; import org.springframework.stereotype.Repository;
/**
 * Spring Data JPA repository for {@link Item} entities keyed by {@code Long}.
 *
 * <p>Declares no methods of its own; everything callers use is inherited from
 * {@link JpaRepository}.
 */
@Repository
public interface ItemDao extends JpaRepository<Item, Long> {
    // Intentionally empty.
}
|
3.3 编写Service
ItemService
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
| package cn.itbuild.jd.service;
import cn.itbuild.jd.pojo.Item;
import java.util.List;
/**
 * Service-layer contract for storing and querying {@link Item} records.
 */
public interface ItemService {

    /**
     * Persists the given item.
     *
     * @param item the item to store
     */
    void save(Item item);

    /**
     * Looks up items using the given item as the search criteria.
     *
     * @param item item whose values describe what to search for
     * @return the matching items
     */
    List<Item> findAll(Item item);
}
|
ItemServiceImpl
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38
| package cn.itbuild.jd.service.impl;
import cn.itbuild.jd.dao.ItemDao; import cn.itbuild.jd.pojo.Item; import cn.itbuild.jd.service.ItemService; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.data.domain.Example; import org.springframework.stereotype.Service;
import java.util.List;
/**
 * {@link ItemService} implementation that delegates straight to {@link ItemDao}.
 */
@Service
public class ItemServiceImpl implements ItemService {

    @Autowired
    private ItemDao itemDao;

    /**
     * Stores the item through the repository.
     *
     * @param item the item to store
     */
    @Override
    public void save(Item item) {
        this.itemDao.save(item);
    }

    /**
     * Wraps the given item in an {@link Example} and returns whatever the
     * repository finds for it.
     *
     * @param item the item used to build the example
     * @return the items returned by the repository
     */
    @Override
    public List<Item> findAll(Item item) {
        return this.itemDao.findAll(Example.of(item));
    }
}
|
3.4 编写引导类
1 2 3 4 5 6 7 8 9 10 11 12 13
| package cn.itbuild.jd;
import org.springframework.boot.SpringApplication; import org.springframework.boot.autoconfigure.SpringBootApplication; import org.springframework.scheduling.annotation.EnableScheduling;
/**
 * Spring Boot entry point of the JD crawler.
 *
 * <p>{@code @EnableScheduling} is present so that {@code @Scheduled} methods
 * in the application are executed.
 */
@SpringBootApplication
@EnableScheduling
public class JdApplication {

    /**
     * Boots the Spring application context.
     *
     * @param args command-line arguments, handed to Spring Boot unchanged
     */
    public static void main(String[] args) {
        SpringApplication.run(JdApplication.class, args);
    }
}
|
3.5 封装HttpClient
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152
| package cn.itbuild.jd.utils;
import org.apache.http.client.config.RequestConfig; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; import org.apache.http.util.EntityUtils; import org.springframework.stereotype.Component;
import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; import java.util.UUID;
/**
 * Small HTTP helper built on a pooled Apache HttpClient.
 *
 * <p>Both public methods report failure by returning an empty string; I/O
 * errors are printed and never propagated to the caller.
 */
@Component
public class HttpUtils {

    /** Directory the downloaded images are written to. */
    private static final String IMAGE_DIR = "E:/file/gitee/crawler/jd-image/";

    private PoolingHttpClientConnectionManager cm;

    /**
     * Client shared by all requests. It is built once on top of the pool instead
     * of once per call, because the per-call clients were never closed.
     */
    private final CloseableHttpClient httpClient;

    /** Creates the connection pool (100 connections in total, 10 per route). */
    public HttpUtils() {
        this.cm = new PoolingHttpClientConnectionManager();
        this.cm.setMaxTotal(100);
        this.cm.setDefaultMaxPerRoute(10);
        this.httpClient = HttpClients.custom().setConnectionManager(this.cm).build();
    }

    /**
     * Downloads a page and returns its body decoded as UTF-8.
     *
     * @param url address of the page
     * @return the page content, or {@code ""} when the status is not 200, the
     *         response has no entity, or an I/O error occurs
     */
    public String doGetHtml(String url) {
        HttpGet httpGet = newGet(url);

        // try-with-resources releases the response on every path.
        try (CloseableHttpResponse response = this.httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200 && response.getEntity() != null) {
                return EntityUtils.toString(response.getEntity(), "utf8");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return "";
    }

    /**
     * Downloads an image into the image directory under a random file name.
     *
     * @param url address of the image
     * @return the generated file name (UUID plus the extension taken from the
     *         URL), or {@code ""} when the status is not 200, the response has
     *         no entity, or an I/O error occurs
     */
    public String doGetImage(String url) {
        HttpGet httpGet = newGet(url);

        try (CloseableHttpResponse response = this.httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200 && response.getEntity() != null) {
                String picName = UUID.randomUUID().toString() + extensionOf(url);
                // The stream must be closed, otherwise every download leaks a file handle.
                try (OutputStream outputStream = new FileOutputStream(new File(IMAGE_DIR + picName))) {
                    response.getEntity().writeTo(outputStream);
                }
                return picName;
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return "";
    }

    /** Builds a GET request carrying the shared timeouts and headers. */
    private HttpGet newGet(String url) {
        HttpGet httpGet = new HttpGet(url);
        httpGet.setConfig(getConfig());
        setHeaders(httpGet);
        return httpGet;
    }

    /**
     * Returns the file extension (including the dot) of the last path segment
     * of {@code url}, ignoring any query string or fragment, or {@code ""}
     * when that segment has no dot.
     */
    private static String extensionOf(String url) {
        int end = url.length();
        int query = url.indexOf('?');
        if (query >= 0) {
            end = query;
        }
        int fragment = url.indexOf('#');
        if (fragment >= 0 && fragment < end) {
            end = fragment;
        }
        String path = url.substring(0, end);
        String lastSegment = path.substring(path.lastIndexOf('/') + 1);
        int dot = lastSegment.lastIndexOf('.');
        return dot < 0 ? "" : lastSegment.substring(dot);
    }

    /** Timeouts in milliseconds: 1000 to connect, 500 to lease a pooled connection, 10000 between data packets. */
    private RequestConfig getConfig() {
        return RequestConfig.custom()
                .setConnectTimeout(1000)
                .setConnectionRequestTimeout(500)
                .setSocketTimeout(10000)
                .build();
    }

    /** Sets a desktop-browser User-Agent header on the request. */
    private void setHeaders(HttpGet httpGet) {
        httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36");
    }
}
|
3.6 实现数据抓取
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
| package cn.itbuild.jd.task;
import cn.itbuild.jd.pojo.Item; import cn.itbuild.jd.service.ItemService; import cn.itbuild.jd.utils.HttpUtils; import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.commons.lang3.StringUtils; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.scheduling.annotation.Scheduled; import org.springframework.stereotype.Component;
import java.io.IOException; import java.util.Date; import java.util.List;
@Component public class ItemTask {
@Autowired private HttpUtils httpUtils;
@Autowired private ItemService itemService;
private static final ObjectMapper MAPPER = new ObjectMapper();
@Scheduled(fixedDelay = 10*1000) public void itemTask() throws Exception{ String utl = "https://search.jd.com/search?keyword=%E6%89%8B%E6%9C%BA&wq=%E6%89%8B%E6%9C%BA&ev=559_103811%5E&s=57&click=0&page=";
for (int i = 19; i < 20; i = i + 2) { String html = httpUtils.doGetHtml(utl + i); this.parse(html); }
System.out.println("手机数据抓取完成..."); }
private void parse(String html) throws IOException { Document doc = Jsoup.parse(html);
Elements spuEles = doc.select("div#J_goodsList>ul>li");
for (Element spuEle : spuEles) { if (StringUtils.isNotEmpty(spuEle.attr("data-spu"))) { long spu = Long.parseLong(spuEle.attr("data-spu"));
Elements skuEles = spuEle.select("ul.ps-main>li.ps-item"); for (Element skuEle : skuEles) { long sku = Long.parseLong(skuEle.select("[data-sku]").first().attr("data-sku"));
Item item = new Item();
item.setSku(sku); List<Item> list = this.itemService.findAll(item); if (list.size() > 0) { continue; } item.setSpu(spu); String itemUrl = "https://item.jd.com/"+sku+".html"; item.setUrl(itemUrl);
String picUrl = "https:" + skuEle.select("img[data-sku]").first().attr("data-lazy-img"); picUrl = picUrl.replace("/n7/","/n1/"); String picName = this.httpUtils.doGetImage(picUrl); item.setPic(picName);
String priceJson = this.httpUtils.doGetHtml("https://p.3.cn/prices/mgets?skuIds=J_" + sku); double price = MAPPER.readTree(priceJson).get(0).get("p").asDouble(); item.setPrice(price);
String itemInfo = this.httpUtils.doGetHtml(item.getUrl());
String title = Jsoup.parse(itemInfo).select("div.sku-name").text(); item.setTitle(title);
item.setCreated(new Date()); item.setUpdated(item.getCreated());
this.itemService.save(item); } } }
} }
|
☆