1 Star 0 Fork 0

杨谨徽/代码托管

Create your Gitee Account
Explore and code with more than 13.5 million developers. Free private repositories! :)
Sign up
文件
This repository doesn't specify a license. Please pay attention to the specific project description and its upstream code dependencies when using it.
Clone or Download
爬虫:爬取专利数据 6.25 KB
Copy Edit Raw Blame History
杨谨徽 authored 2021-11-20 12:53 +08:00 . add 爬虫:爬取专利数据.
import requests
import execjs
def _prompt_int(message):
    """Show *message* as a console prompt and return the reply parsed as an int."""
    return int(input(message))


# Paging parameters for the search request, typed in by the user.
# The prompts ask for a page size of at most 100 and a small page index.
pageSize = _prompt_int('pageSize(不要大于100):')
pageIndex = _prompt_int('pageIndex(尽量小点):')
# Fetch the HTML of the search-results page; the token used further down is
# cut out of this page's text.
# NOTE(review): the Cookie header carries hard-coded session identifiers.
# They presumably expire and should not live in source control -- confirm how
# fresh cookies are meant to be supplied.
url = 'https://www.baiten.cn/results/l/java/.html?type=l'
headers = {
    'Content-Type': 'application/x-www-form-urlencoded',
    'charset': 'UTF-8',
    'Host': 'www.baiten.cn',
    'Referer': 'https://www.baiten.cn/results/l/java/.html?type=l',
    'Cookie': 'UM_distinctid=17cee04232917-0ce7616d753d2c-57b1a33-144000-17cee04232a152; BSESSION=ebd591955fd30d1257027dbb9963df56f77fa57e2713e5a3; CNZZDATA1275904268=820333148-1636079972-|1637373846; JSESSIONID=2CEB334BD9F1C7AB209ABCC0E1FCD9B6; Hm_lvt_7fc44f078bf7b5e19489428c362109a3=1637371390,1637376101,1637380097,1637380316; PD=ef38206be5b44e431a74453174c7df9bf2477939ae99f267254b09e8136946e9bd45a801cd1d3486; yunsuo_session_verify=58a343a92e57e07624bb552dd150650c; Hm_lpvt_7fc44f078bf7b5e19489428c362109a3=1637382574',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}
response = requests.get(url=url, headers=headers).text

# The token is the 80 characters found at a fixed position in the page text.
# NOTE(review): these offsets presumably match the page markup at the time of
# writing; any change to the page shifts the window -- confirm, or replace
# with a pattern match on the surrounding markup.
TOKEN_START = 5568
TOKEN_END = 5648
token = response[TOKEN_START:TOKEN_END]
if len(token) != TOKEN_END - TOKEN_START:
    # Slicing never raises, so fail explicitly instead of silently sending a
    # truncated token to the server.
    raise IndexError(
        'page has only %d characters; token window %d-%d is out of range'
        % (len(response), TOKEN_START, TOKEN_END)
    )
# Compile the JavaScript encoder that is applied to the extracted token.
# encode64 is a Base64-style encoder working on character codes: every three
# input characters become four symbols looked up in keyStr, and index 64
# ('=') is used as padding when the input runs out mid-group.
# The encoder is only compiled here; it is invoked once, where the request
# URL is built (the previous extra call discarded its result).
ctx1 = execjs.compile("""
var keyStr = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=";
function encode64(input) {
var output = "";
var chr1, chr2, chr3 = "";
var enc1, enc2, enc3, enc4 = "";
var i = 0;
do {
chr1 = input.charCodeAt(i++);
chr2 = input.charCodeAt(i++);
chr3 = input.charCodeAt(i++);
enc1 = chr1 >> 2;
enc2 = ((chr1 & 3) << 4) | (chr2 >> 4);
enc3 = ((chr2 & 15) << 2) | (chr3 >> 6);
enc4 = chr3 & 63;
if (isNaN(chr2)) {
enc3 = enc4 = 64
} else {
if (isNaN(chr3)) {
enc4 = 64
}
}
output = output + keyStr.charAt(enc1) + keyStr.charAt(enc2) + keyStr.charAt(enc3) + keyStr.charAt(enc4);
chr1 = chr2 = chr3 = "";
enc1 = enc2 = enc3 = enc4 = ""
} while (i < input.length);
return output;
}
""")
# Build the list-endpoint address: the fixed prefix followed by the encoded
# token, i.e. https://www.baiten.cn/results/list/<encoded token>.
str1 = 'https://www.baiten.cn/results/list/'
str2 = ctx1.call("encode64", token)
url1 = str1 + str2
# The patent data is requested from this address under the name url2.
url2 = url1
# Form fields of the search request: the query term 'java' plus the paging
# values entered by the user.
data = {
    'sc': '',
    'q': 'java',
    'sort': '',
    'sortField': '',
    'fq': '',
    'pageSize': pageSize,
    'pageIndex': pageIndex,
    'type': 'l',
    'merge': 'no-merge',
}
# Browser-like request headers.
# Content-Length is deliberately not listed: a fixed value is only right for
# one particular body size, and requests derives the header from the encoded
# form body itself.
# NOTE(review): the Cookie header carries hard-coded session identifiers;
# confirm how fresh cookies are meant to be supplied.
headers = {
    'Content-Type': 'application/x-www-form-urlencoded',
    'charset': 'UTF-8',
    'Host': 'www.baiten.cn',
    'Origin': 'https://www.baiten.cn',
    'Referer': 'https://www.baiten.cn/results/l/java/.html?type=l',
    'Cookie': 'UM_distinctid=17cee04232917-0ce7616d753d2c-57b1a33-144000-17cee04232a152; BSESSION=ebd591955fd30d1257027dbb9963df56f77fa57e2713e5a3; CNZZDATA1275904268=820333148-1636079972-|1637373846; JSESSIONID=2CEB334BD9F1C7AB209ABCC0E1FCD9B6; Hm_lvt_7fc44f078bf7b5e19489428c362109a3=1637371390,1637376101,1637380097,1637380316; PD=ef38206be5b44e431a74453174c7df9bf2477939ae99f267254b09e8136946e9bd45a801cd1d3486; yunsuo_session_verify=58a343a92e57e07624bb552dd150650c; Hm_lpvt_7fc44f078bf7b5e19489428c362109a3=1637383707',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': 'Windows',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}
# POST the form and decode the JSON body of the reply.
requests1 = requests.post(url2, headers=headers, data=data).json()
def _join_with_leading_space(items):
    """Concatenate the strings in *items*, prefixing each with one space."""
    return ''.join(' ' + item for item in items)


def _format_patent(fields, page_index, position):
    """Return the text block written to the output file for one patent.

    fields: the 'field_values' mapping of one returned document.
    page_index: page number printed in the block header.
    position: 1-based position of the patent within the page.

    Raises KeyError if one of the expected fields is missing, and TypeError
    if a field that is concatenated directly is not a string.
    """
    # All fields are looked up before any text is assembled, so a record with
    # a missing key never produces partial output.
    rows = [
        ('专利名称:', fields['ti']),                                # patent title
        ('申请号:', fields['id']),                                  # application number
        ('公开号:', fields['pn']),                                  # publication number
        ('申请(专利权)人:', _join_with_leading_space(fields['pa'])),  # applicants (list)
        ('发明人:', _join_with_leading_space(fields['in'])),        # inventors (list)
        ('申请日:', fields['ad']),                                  # application date
        ('公开日:', fields['pd']),                                  # publication date
        ('主分类号:', fields['ic1']),                               # main classification
        ('分类号:', fields['aic1']),                                # classification
        ('地址:', fields['aa']),                                    # address
        ('国省代码:', fields['co']),                                # country/province code
        ('技术关键词:', _join_with_leading_space(fields['kw'])),    # keywords (list)
        ('专利授权情况:', fields['ls1']),                           # grant status
        ('摘要:', fields['ab']),                                    # abstract
        ('学术搜索:', fields['ac']),                                # academic search
    ]
    header = '第%d页的专利产品\n第%d个专利产品\n' % (page_index, position)
    return header + ''.join(label + value + '\n' for label, value in rows)


# Write every returned patent to data.txt.
# - The inventor line is built from the 'in' field; it used to repeat the
#   applicants from 'pa'.
# - At most pageSize documents are written, and a reply with fewer documents
#   no longer raises IndexError.
# - The file is closed by the with-block, and UTF-8 is used so the Chinese
#   text does not depend on the platform's default encoding.
documents = requests1['cubePatentSearchResponse']['documents']
with open('data.txt', 'w', encoding='utf-8') as patentdata:
    # max() keeps a negative pageSize meaning "write nothing".
    for num, document in enumerate(documents[:max(pageSize, 0)], start=1):
        patentdata.write(_format_patent(document['field_values'], pageIndex, num))
print("数据获取完毕!")
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/SHIBATORI/code-hosting.git
git@gitee.com:SHIBATORI/code-hosting.git
SHIBATORI
code-hosting
代码托管
master

Search