前言
之前两篇 requests 文章,爬取时定位的都是页面中固定的标签 id(唯一值)。
这次我爬取的是赶集网的找房(租房)页面,只抓了一点数据,整体流程和爬小说差不多。
代码
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
from contextlib import closing

import pymysql
import requests
from lxml import etree
# Scrape the first page of Shanghai rental listings from ganji.com and store
# every listing whose link is not yet present in the MySQL table `house`.

LISTING_URL = 'http://sh.ganji.com/zufang/'
REQUEST_TIMEOUT = 10  # seconds; without it requests.get() can block forever

# One XPath per scraped column. The n-th match of every expression is taken
# to describe the n-th listing on the page.
ITEM_XPATH = '//*[@class="f-list-item ershoufang-list"]'
COLUMN_XPATHS = (
    ITEM_XPATH + '/dl/dd/a/@href',                                        # link
    ITEM_XPATH + '/dl/dd/a/text()',                                       # title
    '//*[@class="price"]/span[1]/text()',                                 # price
    ITEM_XPATH + '/dl/dd[@class="dd-item size"]/span[1]/text()',          # layout
    ITEM_XPATH + '/dl/dd[@class="dd-item size"]/span[3]/text()',          # size
    ITEM_XPATH + '/dl/dd[@class="dd-item size"]/span[5]/text()',          # orientation
    ITEM_XPATH + '/dl/dd[@class="dd-item address"]/span/a[1]/text()',     # place, part 1
    ITEM_XPATH + '/dl/dd[@class="dd-item address"]/span/a[2]/span/text()',  # place, part 2
)

# Keyword arguments only: PyMySQL 1.0+ no longer accepts positional
# connection settings. The charset is set explicitly instead of relying on
# the driver's default, because the scraped text is Chinese.
DB_CONFIG = {
    "host": "127.0.0.1",
    "user": "root",
    "password": "123456",
    "database": "mypython",
    "charset": "utf8mb4",
}

# Values are bound by the driver (%s placeholders) rather than formatted into
# the statement, so quotes in scraped text can neither break nor inject SQL.
EXISTS_SQL = "SELECT 1 FROM house WHERE link = %s LIMIT 1"
INSERT_SQL = (
    "INSERT INTO house(link, title, price, type, size, orientation, place) "
    "VALUES (%s, %s, %s, %s, %s, %s, %s)"
)


def fetch_columns(url=LISTING_URL):
    """Download *url* and return one list of matches per COLUMN_XPATHS entry.

    Raises requests.RequestException on network errors, timeouts and HTTP
    error statuses.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    selector = etree.HTML(response.content)
    return [selector.xpath(path) for path in COLUMN_XPATHS]


def build_listings(links, titles, prices, layouts, sizes, orientations,
                   districts, areas):
    """Combine the parallel column lists into one tuple per listing.

    Each tuple is ordered like the column list of INSERT_SQL:
    (link, title, price, type, size, orientation, place), where place is the
    two stripped location parts joined by "-".

    zip() stops at the shortest column, so a page on which one selector
    matched fewer nodes yields fewer rows instead of raising IndexError.
    NOTE(review): unequal column lengths also mean the columns may be
    misaligned; re-check the selectors if fewer rows than expected appear.
    """
    return [
        (link, title, price, layout, size, orientation,
         district.strip() + "-" + area.strip())
        for link, title, price, layout, size, orientation, district, area
        in zip(links, titles, prices, layouts, sizes, orientations,
               districts, areas)
    ]


def save_listings(db, cursor, listings):
    """Insert every listing whose link is not already stored in `house`.

    Each insert is committed on its own; a failing row is rolled back and
    its error printed, and the remaining rows are still processed.
    """
    for listing in listings:
        cursor.execute(EXISTS_SQL, (listing[0],))
        if cursor.fetchone():
            print(" ---- 数据已存在 ---- ")
            continue
        try:
            cursor.execute(INSERT_SQL, listing)
            db.commit()
        except pymysql.MySQLError as e:
            db.rollback()
            print(e)
        else:
            print(" ---- 新增成功 ---- ")


def main():
    """Scrape the listing page, then store the new listings in MySQL."""
    listings = build_listings(*fetch_columns())
    # closing() guarantees that both the cursor and the connection are
    # released, even if a query raises.
    with closing(pymysql.connect(**DB_CONFIG)) as db:
        with closing(db.cursor()) as cursor:
            print(" ---- 数据库连接成功 ---- ")
            save_listings(db, cursor, listings)


if __name__ == "__main__":
    main()
数据库结构
-- 创建数据库
create database mypython;
-- 使用指定数据库
use mypython;
|
创建MySQL数据表
1 2 3 4 5 6 7 8 9 10 11 12
-- Table that stores the scraped rental listings.
-- `condition` is a reserved word in MySQL, so its backticks are required.
CREATE TABLE `house` (
    -- surrogate primary key
    `id` INT(11) NOT NULL AUTO_INCREMENT COMMENT '主键',
    -- link to the rental listing
    `link` LONGTEXT COMMENT '出租房链接',
    -- listing title
    `title` LONGTEXT COMMENT '标题',
    -- price
    `price` DECIMAL(11,2) DEFAULT NULL COMMENT '价格',
    -- room layout
    `type` VARCHAR(255) DEFAULT NULL COMMENT '户型',
    -- floor area
    `size` VARCHAR(255) DEFAULT NULL COMMENT '房屋面积',
    -- orientation
    `orientation` VARCHAR(255) DEFAULT NULL COMMENT '朝向',
    -- level of decoration / renovation
    `condition` VARCHAR(255) DEFAULT NULL COMMENT '装修程度',
    -- location of the property
    `place` VARCHAR(255) DEFAULT NULL COMMENT '房子位置',
    PRIMARY KEY (`id`)
) ENGINE = InnoDB
  AUTO_INCREMENT = 1
  DEFAULT CHARSET = utf8;
|