2 Star 0 Fork 0

xfeistar/datacoll

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
get_ccgp_anhui.py 8.36 KB
一键复制 编辑 原始数据 按行查看 历史
liuyang3000 提交于 2023-10-25 14:10 . 更新招标公告
import datetime
import json
import logging
import os
import time

import requests
from bs4 import BeautifulSoup

import MDBUtil
# Root-logger configuration: every record at INFO or above is written both to
# the console and to a per-day log file under ./log/.
logger = logging.getLogger()
logger.setLevel('INFO')
BASIC_FORMAT = "%(asctime)s:%(levelname)s:%(message)s"
DATE_FORMAT = '%Y-%m-%d %H:%M:%S'
formatter = logging.Formatter(BASIC_FORMAT, DATE_FORMAT)
# Current local date; used as the prefix of the log file name.
filename = time.strftime('%Y-%m-%d')

chlr = logging.StreamHandler()  # handler that writes to the console
chlr.setFormatter(formatter)
chlr.setLevel('INFO')  # optional: without it the handler passes on everything the logger lets through

# FileHandler opens the file immediately and does not create missing parent
# directories, so make sure ./log exists before constructing it.
os.makedirs("./log", exist_ok=True)
fhlr = logging.FileHandler("./log/" + filename + "_ccgp_anhui.log")  # handler that writes to the log file
fhlr.setFormatter(formatter)

logger.addHandler(chlr)
logger.addHandler(fhlr)
class ccgp_anhui:
    """Collect procurement announcements from www.ccgp-anhui.gov.cn.

    The list endpoint (``/portal/category``) is queried page by page with POST
    requests; for every list entry the detail endpoint (``/portal/detail``) is
    fetched with GET, and the combined rows are handed to
    ``MDBUtil.insert_t_purchase_list``.  All requests go through one shared
    ``requests`` session so cookies obtained by one request are sent with the
    following ones.
    """

    # Number of consecutive failed list-page fetches tolerated before
    # pagination gives up (prevents an endless retry loop).
    max_page_failures = 3

    def __init__(self):
        """Create the shared HTTP session and the default request headers."""
        # Mirror of the cookies received so far, keyed by cookie name.
        self.cookies = dict()
        self.session = requests.session()
        # Headers sent with every GET request.
        self.get_headers = {
            'Host': 'www.ccgp-anhui.gov.cn',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62'
        }
        # Headers sent with every POST request (JSON body).
        self.post_headers = {
            'Host': 'www.ccgp-anhui.gov.cn',
            'Accept': 'application/json, text/plain, */*',
            'Accept-Encoding': 'gzip, deflate',
            'Content-Type': 'application/json',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62'
        }
        # Timeout (seconds) passed to every request.
        self.timeout = 30

    def update_cookies(self, new_cookies):
        """Merge ``new_cookies`` (name -> value) into ``self.cookies``."""
        for key in new_cookies:
            self.cookies[key] = new_cookies[key]

    def _sync_session_cookies(self):
        """Seed the shared session with tracked cookies it does not hold yet.

        ``overwrite=False`` leaves cookies the session already received from
        the server untouched, so a name is never added a second time.
        """
        requests.utils.cookiejar_from_dict(
            self.cookies, cookiejar=self.session.cookies, overwrite=False)

    def req_get(self, url):
        """Send a GET request through the shared session.

        Args:
            url: Address to fetch.

        Returns:
            The ``requests`` response object.
        """
        self._sync_session_cookies()
        resp = self.session.get(url, headers=self.get_headers, timeout=self.timeout)
        self.update_cookies(requests.utils.dict_from_cookiejar(resp.cookies))
        return resp

    def req_post(self, url, data):
        """Send a POST request through the shared session.

        Args:
            url: Address to post to.
            data: Request body (callers in this class pass a JSON string).

        Returns:
            The ``requests`` response object.
        """
        # Use the same session as req_get so cookies collected by earlier
        # requests accompany the POST, and remember any cookies it returns.
        self._sync_session_cookies()
        resp = self.session.post(url, data=data, headers=self.post_headers, timeout=self.timeout)
        self.update_cookies(requests.utils.dict_from_cookiejar(resp.cookies))
        return resp

    @staticmethod
    def _get_val_from_object(obj, key):
        """Return ``str(obj[key])`` when ``key`` is present in ``obj``, else ``''``."""
        if key in obj:
            return str(obj[key])
        return ''

    def _fetch_page(self, query_param_obj, page_no):
        """POST one list query and return ``(total, entries)``, or ``None`` on failure."""
        try:
            self.url = 'http://www.ccgp-anhui.gov.cn/portal/category'
            response = self.req_post(self.url, json.dumps(query_param_obj))
            response_code = response.status_code
            if response_code != 200:
                logger.info("get_purchase_list,请求异常,返回状态[%d]" % (response_code,))
                print(response)
                return None
            logger.info("get_purchase_list,请求成功")
            result_data = json.loads(response.text)['result']['data']
            return result_data['total'], result_data['data']
        except Exception:
            logger.error("get_purchase_list,获取数据失败异常,页数[%d]" % (page_no,), exc_info=True)
            return None

    def _build_row(self, one):
        """Turn one list entry (plus its detail record) into a database row tuple."""
        get_val = self._get_val_from_object
        # The '==' suffix is removed before the id is used in the detail URL
        # (presumably base64 padding -- confirm against the site).
        objid = get_val(one, 'articleId').replace('==', '')
        # publishDate is divided by 1000 before conversion, i.e. it is treated
        # as an epoch timestamp in milliseconds.
        publishtime = datetime.datetime.fromtimestamp(
            int(get_val(one, 'publishDate')) / 1000.0).strftime('%Y-%m-%d %H:%M:%S')
        cggg_content = self.get_cggg(objid)
        prjName = get_val(cggg_content, 'projectName')
        prjNo = get_val(cggg_content, 'projectCode')
        article = get_val(cggg_content, 'title')
        cggg_content_text = get_val(cggg_content, 'content_text')
        cggg_content_url = get_val(cggg_content, 'cggg_content_url')
        area = get_val(one, 'districtName')
        purchaseType = get_val(one, 'pathName')
        prjId = get_val(one, 'articleId')
        p_zone_code = get_val(one, 'pZoneCode')
        p_zone_name = get_val(one, 'pZoneName')
        zone_code = get_val(one, 'zoneCode')
        zone_name = get_val(one, 'zoneName')
        purchasePerson = get_val(one, 'buyerName')
        agency = get_val(one, 'agentName')
        return (objid, publishtime, prjName, prjNo, area, purchaseType, purchasePerson,
                agency, article, prjId, p_zone_code, p_zone_name, zone_code, zone_name,
                cggg_content_text,
                cggg_content_url, '安徽省')

    def get_purchase_list(self, query_param_obj):
        """Walk the announcement list page by page and store every page.

        Args:
            query_param_obj: Query dict sent as the JSON body.  Must contain
                ``pageNo`` and ``pageSize`` (anything ``int()`` accepts).
                Its ``pageNo`` entry is overwritten with the page currently
                being fetched.

        Pagination ends when ``total <= pageNo * pageSize``, or after
        ``max_page_failures`` consecutive failed fetches of the same page.
        A page whose rows cannot be built or stored is logged and skipped.
        """
        page_no = int(query_param_obj['pageNo'])
        page_size = int(query_param_obj['pageSize'])
        failures = 0
        while True:
            query_param_obj["pageNo"] = page_no
            page = self._fetch_page(query_param_obj, page_no)
            if page is None:
                failures += 1
                if failures >= self.max_page_failures:
                    logger.error("get_purchase_list,连续失败[%d]次,停止翻页,页数[%d]" % (failures, page_no))
                    break
                continue
            failures = 0
            total, result_list = page
            try:
                parse_list = [self._build_row(one) for one in result_list]
                MDBUtil.insert_t_purchase_list(parse_list)
            except Exception:
                # Best effort: a page that cannot be processed is logged and
                # skipped so the remaining pages are still collected.
                logger.error("get_purchase_list,获取数据失败异常,页数[%d]" % (page_no,), exc_info=True)
            if total <= page_no * page_size:
                break
            page_no += 1

    def get_cggg(self, ggid):
        """Fetch the content of a procurement announcement by its id.

        Args:
            ggid: Announcement id; ``+`` is percent-encoded before it is put
                into the detail URL.

        Returns:
            A dict with ``content_text``, ``districtCode``, ``projectCode``,
            ``projectName``, ``title`` and ``cggg_content_url`` on success.
            On a non-200 response or any error the dict is empty or holds
            only the keys filled in before the failure.
        """
        # Created up front so the final return works even when a request fails.
        cggg_content = {}
        ggid = ggid.replace('+', '%2B')
        detail_url = 'http://www.ccgp-anhui.gov.cn/portal/detail?articleId=%s' % (ggid,)
        try:
            # A fixed page is requested before each detail request and its
            # response is discarded (presumably to obtain the cookies the
            # site expects -- confirm).
            self.url = 'http://www.ccgp-anhui.gov.cn/CSPDREL3BvcnRhbC9kZXRhaWw/YXJ0aWNsZUlkPUl0UDRVMmhnbjI0SjlQaUl3TTNTRVE=?wzwscspd=MC4wLjAuMA=='
            self.req_get(self.url)
            response = self.req_get(detail_url)
            response.encoding = 'utf-8'
            if response.status_code == 200:
                result_data = json.loads(response.text)['result']['data']
                # Strip the HTML markup and keep only the text.
                cggg_content['content_text'] = BeautifulSoup(result_data['content'], "lxml").text
                cggg_content['districtCode'] = result_data['districtCode']
                cggg_content['projectCode'] = result_data['projectCode']
                cggg_content['projectName'] = result_data['projectName']
                cggg_content['title'] = result_data['title']
                # Record the URL this announcement was actually fetched from,
                # not the fixed warm-up URL held in self.url.
                cggg_content['cggg_content_url'] = detail_url
                logger.info("ggid[%s],请求成功" % (ggid,))
        except Exception:
            logger.error("ggid[%s],请求失败" % (ggid,), exc_info=True)
        return cggg_content
if __name__ == '__main__':
    # Script entry point: query announcements published during the last
    # 100 days whose keyword is "环境" and store them.
    work = ccgp_anhui()
    publishDateEnd = datetime.datetime.now().strftime("%Y-%m-%d")
    hundred_days = datetime.timedelta(days=100)
    publishDateBegin = (datetime.datetime.now() - hundred_days).strftime('%Y-%m-%d')
    query_param_obj = dict(
        categoryCode="ZcyAnnouncement2",
        keyword="环境",
        pageNo=1,
        pageSize="15",
        districtCode=None,
        leaf=None,
        publishDateBegin=publishDateBegin,
        publishDateEnd=publishDateEnd,
    )
    work.get_purchase_list(query_param_obj)
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/xfeistar/datacoll.git
git@gitee.com:xfeistar/datacoll.git
xfeistar
datacoll
datacoll
master

搜索帮助