前言
这里只需要一个py文件就能实现数据采集
它区别于之前记录的方式,这里没有使用Scrapy框架,直接通过Requests提取
使用Requests之前,需要提前安装好用到的第三方库(requests、lxml、pymysql),例如:pip install requests lxml pymysql
代码注释我已经写得挺清晰的了~~~
目标:
- 创建普通的python爬虫项目
- 爬取正确的数据
(1) 对爬取的数据进行格式转换
- 对爬取的数据进行数据库存储
新建一个py文件
文件名:myCrawler.py
py代码如下
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61
| import requests from lxml import etree import pymysql
def _split_chapter_title(raw_title):
    """Split a raw heading into ``(chapter, title)`` at the first "章".

    The chapter part keeps the "章" character itself and the title is
    whatever follows it. When the heading contains no "章", the chapter is
    the placeholder "-" and the whole heading is used as the title.
    """
    head, marker, tail = raw_title.partition("章")
    if marker:
        return head + marker, tail
    return "-", raw_title


def job():
    """Fetch one chapter page and store it in the ``fiction`` table.

    Steps:
      1. Download the hard-coded chapter URL.
      2. Read the heading from ``div.title_txtbox`` and the body text from
         ``div.content``.
      3. Split the heading into chapter number and title.
      4. Insert a row into ``fiction`` unless a row with the same chapter
         value is already present.

    Returns:
        None. Progress and database errors are reported with ``print``.

    Raises:
        requests.RequestException: if the download fails, times out, or the
            server answers with an HTTP error status.
        pymysql.MySQLError: if the connection or the existence check fails
            (insert failures are rolled back and printed instead).
    """
    url = 'http://book.zongheng.com/chapter/885037/58155562.html'
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(url, timeout=10)
    # Fail early rather than storing an HTTP error page as chapter content.
    response.raise_for_status()
    selector = etree.HTML(response.content)

    # Join the text nodes directly. Round-tripping through str(list) would
    # inject repr() escapes and break on headings that contain quotes.
    title_nodes = selector.xpath("//div[@class='title_txtbox']/text()")
    chapter, title = _split_chapter_title("".join(title_nodes))

    content_nodes = selector.xpath("//div[@class='content']//text()")
    content = "".join(content_nodes).strip()

    # Keyword arguments are required by PyMySQL 1.0+ (positional arguments
    # were removed). utf8mb4 matches the character set of the fiction table.
    # NOTE(review): credentials are hard-coded; move them to configuration
    # before using this outside a local experiment.
    db = pymysql.connect(
        host="127.0.0.1",
        user="root",
        password="123456",
        database="mypython",
        charset="utf8mb4",
    )
    try:
        print(" ---- 数据库连接成功 ---- ")
        with db.cursor() as cursor:
            # Parameterized queries: the scraped text routinely contains
            # quotes, which would break (or inject into) string-built SQL.
            cursor.execute(
                "SELECT * FROM fiction WHERE chapter = %s", (chapter,)
            )
            if cursor.fetchone():
                print(" ---- 数据已存在 ---- ")
                return

            try:
                cursor.execute(
                    "INSERT INTO fiction(chapter, title, content) "
                    "VALUES (%s, %s, %s)",
                    (chapter, title, content),
                )
                db.commit()
            except pymysql.MySQLError as e:
                db.rollback()
                print(e)
            else:
                print(" ---- 新增成功 ---- ")
    finally:
        # Always release the connection, including on the early return above.
        db.close()
# Run the crawl immediately at module level (on execution or on import).
job()
|
执行代码
本地MYSQL数据库
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
-- Create the schema used by the crawler and select it, so the table below is
-- created inside it. IF NOT EXISTS keeps the script re-runnable, in line with
-- the DROP TABLE IF EXISTS that follows.
CREATE DATABASE IF NOT EXISTS mypython;
USE mypython;

-- Session settings for the import: utf8mb4 client encoding, and foreign key
-- checks disabled while tables are dropped and recreated.
SET NAMES utf8mb4;
SET FOREIGN_KEY_CHECKS = 0;
-- Recreate the table that stores the crawled chapters.
DROP TABLE IF EXISTS `fiction`;
CREATE TABLE `fiction` (
  `id` int(11) NOT NULL AUTO_INCREMENT COMMENT '主键id',
  `chapter` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '小说章节数',
  `title` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '小说标题',
  `content` longtext CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci COMMENT '小说内容',
  PRIMARY KEY (`id`) USING BTREE
  -- The table collation must belong to the table character set:
  -- utf8_general_ci is not valid for utf8mb4 and makes CREATE TABLE fail.
) ENGINE = InnoDB AUTO_INCREMENT = 1 CHARACTER SET = utf8mb4 COLLATE = utf8mb4_general_ci ROW_FORMAT = Dynamic;

-- Restore foreign key checking for the rest of the session.
SET FOREIGN_KEY_CHECKS = 1;
|