前言
本文主要介绍如何通过抖音短视频的分享链接获取以下信息:带水印的视频链接、不带水印的视频链接、视频文案、作者信息
代码
如果不需要进行数据库相关操作,可以将下面这两行依赖删除:
from pymysql import * # 引入数据库依赖
from pymysql.converters import escape_string # 入库字符转义
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97
| import re import json import requests import webbrowser
from pymysql import *
from pymysql.converters import escape_string
def getDouyinUrlByShareUrl(douyinShareUrl):
    """Resolve a Douyin share link to a watermark-free video URL.

    The share link is requested so that redirects are followed, the video id
    is read from the ``/video/<id>`` segment of the final URL, and the
    item-info endpoint is queried for the first play address.

    Args:
        douyinShareUrl: Share URL, e.g. ``https://v.douyin.com/xxxx/``.

    Returns:
        The first play address with ``playwm`` rewritten to ``play``.

    Raises:
        ValueError: If the redirected URL contains no ``/video/<id>`` segment.
        KeyError, IndexError: If the API response lacks the expected fields.
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/103.0.5060.114 Mobile Safari/537.36 Edg/103.0.1264.62'
    }
    # Without a timeout a stalled connection would block forever.
    timeout_seconds = 10

    redirected = requests.get(douyinShareUrl, headers=headers, timeout=timeout_seconds)
    final_url = redirected.url

    # Accept the id whether or not it is followed by a slash or a query string.
    match = re.search(r'/video/([^/?#]+)', final_url)
    if match is None:
        raise ValueError('no /video/<id> segment in redirected URL: ' + final_url)
    video_id = match.group(1)

    api_url = 'https://www.iesdouyin.com/web/api/v2/aweme/iteminfo/?item_ids=' + video_id
    payload = requests.get(api_url, headers=headers, timeout=timeout_seconds).json()

    watermarked = payload['item_list'][0]['video']['play_addr']['url_list'][0]
    # Only rewrite the "playwm" path component; removing every "wm" could
    # corrupt ids or query values that happen to contain those letters.
    result = watermarked.replace('playwm', 'play')
    print("视频无水印url:" + result)
    return result
def get_url(content):
    """Return the first ``scheme://...`` token found in *content*.

    A token is a run of ASCII letters, followed by ``://``, followed by one or
    more non-whitespace characters. Matching is case-insensitive.

    Args:
        content: Text that may contain a URL among other words.

    Returns:
        The first matching substring, or ``None`` when there is no match.
    """
    # Raw string avoids the invalid "\S" escape; a single search replaces the
    # two identical findall() scans.
    match = re.search(r'[a-z]+://\S+', content, re.I)
    if match is None:
        return None
    return match.group(0)
def getRedirectUrl(url, header, timeout=10):
    """Request *url* and return the URL of the final response.

    Args:
        url: Address to request.
        header: Mapping of HTTP headers sent with the request.
        timeout: Seconds to wait for the server before giving up. Added with a
            default so existing two-argument callers keep working; without it
            a stalled connection would block indefinitely.

    Returns:
        The ``url`` attribute of the response object.
    """
    response = requests.get(url, headers=header, timeout=timeout)
    return response.url
def getDouyinVideoInfo(douyinShareUrl):
    """Print a Douyin video's caption and URLs; return the watermark-free URL.

    The first URL found in *douyinShareUrl* is followed to its redirect
    target. If that target is a ``www.douyin.com/video`` page, the last path
    segment is used as the item id for the item-info endpoint.

    Args:
        douyinShareUrl: Share text containing a Douyin link (extra words
            around the link are allowed).

    Returns:
        The first play address with ``playwm`` replaced by ``play``, or
        ``None`` when no link is found, the redirect target is not a video
        page, or the response lacks the expected fields.
    """
    douyinLink = 'https://www.iesdouyin.com/web/api/v2/aweme/iteminfo'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/92.0.4515.107 Safari/537.36'}

    share_link = get_url(douyinShareUrl)
    if share_link is None:
        return None

    realUrl = getRedirectUrl(share_link, headers)
    if 'www.douyin.com/video' not in realUrl:
        return None

    # partition() tolerates a URL without a query string (index('?') raised),
    # and rstrip('/') keeps a trailing slash from producing an empty id.
    path = realUrl.partition('?')[0].rstrip('/')
    video_id = path[path.rfind('/') + 1:]

    douyinResponse = requests.get(url=douyinLink, params={'item_ids': video_id},
                                  headers=headers, timeout=10)
    body = douyinResponse.text
    data = json.loads(body)
    # The parsed payload is a dict and cannot be concatenated to a str, so the
    # raw response text is printed instead.
    print("用户信息:" + body)

    try:
        item = data['item_list'][0]
        videoTitle = item['desc']
        videoUrl = item['video']['play_addr']['url_list'][0]
        realVideoUrl = f'{videoUrl}'.replace('playwm', 'play')
        print("视频文案:" + videoTitle)
        print("视频带水印url:" + videoUrl)
        print("视频无水印url:" + realVideoUrl)
        return realVideoUrl
    except (KeyError, IndexError, TypeError) as e:
        # Best effort: report the missing or malformed field and return None.
        print(e)
        return None


if __name__ == '__main__':
    url = 'https://v.douyin.com/jUAfreu/'
    print(getDouyinUrlByShareUrl(url))
    print(" ======================================================= ")
    print(getDouyinVideoInfo(url))
    print(" ======================================================= ")
|
如果分享内容中除了链接之外还夹杂着其他文字,可以通过下面这种方式将其中的 URL 提取出来
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
|
import re
def getUrlNoSuffix(str):
    """Return every ``http(s)://`` prefix found in *str*, without its path.

    Each match is the scheme followed by a run of word characters, ``-``,
    ``.`` or ``%XX`` escapes, so it stops at the first ``/``, ``:``, ``?`` or
    whitespace.

    Args:
        str: Text to scan. The name shadows the builtin but is kept so
            existing keyword callers are unaffected.

    Returns:
        A list of matched substrings in order of appearance; empty when
        nothing matches.
    """
    # Raw string: "\w" and "\d" are invalid escapes in a normal literal.
    return re.findall(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', str)
def getUrl(str):
    """Extract the URLs embedded in free-form text.

    Recognises tokens that start with ``http://``, ``https://``, ``www.``
    (optionally with up to three digits) or a ``domain.tld/`` prefix, and
    allows balanced parentheses inside the URL.

    Args:
        str: Text to scan (the name shadows the builtin; kept for callers).

    Returns:
        A list with the outermost captured group of every match, in order of
        appearance; empty when nothing matches.
    """
    pattern = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    found = []
    for hit in re.finditer(pattern, str):
        # Group 1 wraps the whole URL; the inner groups only serve nesting.
        found.append(hit.group(1))
    return found


if __name__ == '__main__':
    shareUrl = '锄禾日当午,汗滴禾下土 https://image.baidu.com/search/index/ 谁知盘中餐,粒粒皆辛苦'
    print("Urls: ", getUrl(shareUrl))
|