前言

之前两篇 requests 文章爬取的都是带有固定标签 id(唯一值)的内容。
这次我爬取了赶集网租房频道的一部分房源数据,整体思路和之前爬小说基本一致。

代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# coding:utf-8
import requests
from lxml import etree
import pymysql

# Fetch the Ganji (Shanghai) rental listing page and parse it with lxml.
url = 'http://sh.ganji.com/zufang/'
# Without a timeout, requests waits indefinitely on an unresponsive server.
req = requests.get(url, timeout=10)
# Stop on an HTTP error instead of scraping an error page.
req.raise_for_status()
selector = etree.HTML(req.content)

# Every per-listing XPath below shares this prefix.
item = '//*[@class="f-list-item ershoufang-list"]/dl'
# Listing URL
link = selector.xpath(item + '/dd/a/@href')
# Title
title = selector.xpath(item + '/dd/a/text()')
# Price
price = selector.xpath('//*[@class="price"]/span[1]/text()')
# Layout (named house_type so the builtin `type` is not shadowed)
house_type = selector.xpath(item + '/dd[@class="dd-item size"]/span[1]/text()')
# Floor area
size = selector.xpath(item + '/dd[@class="dd-item size"]/span[3]/text()')
# Orientation
orientation = selector.xpath(item + '/dd[@class="dd-item size"]/span[5]/text()')
# Location: the page splits it across two nodes, so the two parts are joined
# inside the loop before the row is inserted.
place1 = selector.xpath(item + '/dd[@class="dd-item address"]/span/a[1]/text()')
place2 = selector.xpath(item + '/dd[@class="dd-item address"]/span/a[2]/span/text()')

# Placeholders let the driver escape the scraped values; building the SQL with
# string formatting breaks on quotes in a title and allows SQL injection.
select_sql = "SELECT 1 FROM house WHERE link = %s LIMIT 1"
insert_sql = (
    "INSERT INTO house(link, title, price, type, size, orientation, place) "
    "VALUES (%s, %s, %s, %s, %s, %s, %s)"
)

# Connect to the database. Keyword arguments are used because positional
# connect() arguments are not accepted by every PyMySQL release; utf8mb4 makes
# the connection encode the Chinese text regardless of the driver default.
db = pymysql.connect(
    host="127.0.0.1",
    user="root",
    password="123456",
    database="mypython",
    charset="utf8mb4",
)
cursor = db.cursor()
print(" ---- 数据库连接成功 ---- ")
try:
    # Each field was extracted as an independent list, so rows are paired by
    # position. zip() stops at the shortest list, which avoids the IndexError
    # the index-based loop raised when the lists had different lengths.
    # NOTE(review): a listing with a missing field can still shift later values
    # onto the wrong row -- confirm against the page markup.
    rows = zip(link, title, price, house_type, size, orientation, place1, place2)
    for href, name, rent, layout, area, facing, district, neighbourhood in rows:
        # Join the two location parts.
        place = district.strip() + "-" + neighbourhood.strip()

        # Skip listings whose link is already stored.
        cursor.execute(select_sql, (href,))
        if cursor.fetchone():
            print(" ---- 数据已存在 ---- ")
            continue

        # Not stored yet: insert it, committing row by row so one bad row
        # does not discard the others.
        try:
            cursor.execute(insert_sql, (href, name, rent, layout, area, facing, place))
            db.commit()
            print(" ---- 新增成功 ---- ")
        except pymysql.MySQLError as e:
            # Roll back the failed insert and carry on with the next listing.
            db.rollback()
            print(e)
finally:
    # Release the cursor and the connection even if the loop fails.
    cursor.close()
    db.close()

数据库结构
-- Create the database used by the scraper (no error if it already exists).
-- MySQL comments start with "-- " or "#"; "//" is a syntax error.
CREATE DATABASE IF NOT EXISTS mypython;
-- Select it for the statements that follow.
USE mypython;

创建MySQL数据表

1
2
3
4
5
6
7
8
9
10
11
12
-- One row per scraped rental listing.
-- utf8mb4 is used instead of utf8 (the 3-byte utf8mb3 set) so that titles
-- containing 4-byte characters such as emoji can be stored.
-- `condition` is a MySQL reserved word and must stay backtick-quoted; the
-- scraper's INSERT does not write it, so it remains NULL.
CREATE TABLE IF NOT EXISTS `house` (
`id` int(11) NOT NULL AUTO_INCREMENT COMMENT '主键',
`link` longtext COMMENT '出租房链接',
`title` longtext COMMENT '标题',
`price` decimal(11,2) DEFAULT NULL COMMENT '价格',
`type` varchar(255) DEFAULT NULL COMMENT '户型',
`size` varchar(255) DEFAULT NULL COMMENT '房屋面积',
`orientation` varchar(255) DEFAULT NULL COMMENT '朝向',
`condition` varchar(255) DEFAULT NULL COMMENT '装修程度',
`place` varchar(255) DEFAULT NULL COMMENT '房子位置',
PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4;