坚持不懈的大白 / Python爬虫

ips2.py 6.29 KB
坚持不懈的大白 authored 2021-05-27 22:16 · add ips2.py
from crawlers.userAgent import useragent
import requests
from lxml import etree
import random
import time
import threading
class IPs(object):
    def __init__(self):
        self.url1 = 'http://www.nimadaili.com/gaoni/%d/'       # proxy site 1, use pages 1-350 (site lists up to 2000)
        self.url2 = 'https://www.89ip.cn/index_%d.html'        # proxy site 2, pages 1-110
        self.url3 = 'https://www.kuaidaili.com/free/inha/%d/'  # proxy site 3, pages 1-4000
        self.proxies = []           # every proxy that gets crawled
        self.userfulProxies = []    # proxies that pass the availability check
        self.url = 'https://www.baidu.com/'  # URL used to test whether a proxy works (Baidu here)
        self.userAgent = useragent()
    def checkIps(self, ips):
        # Keep popping proxies off the shared list until it is empty; the ones
        # that can reach self.url within the timeout are kept.
        while True:
            if len(ips) == 0:
                break
            proxies = ips.pop()
            headers = {'user-agent': self.userAgent.getUserAgent()}
            try:
                rsp = requests.get(url=self.url, headers=headers, proxies=proxies, timeout=0.5)  # short timeout
                if rsp.status_code == 200:
                    self.userfulProxies.append(proxies)
                    # print('========IP {} is usable'.format(proxies))  # uncomment when testing
                time.sleep(1)  # sleep for 1 second
            except Exception as e:
                pass
                # print(e)
                # print('========IP {} is not usable'.format(proxies))  # uncomment when testing
    def getUserIps(self):  # return the usable proxies
        self.spiderIps()
        ips = self.proxies[:]
        # Hundreds of proxies get crawled, so the check runs on 10 threads.
        threads = []
        print('=====Start testing!')
        for i in range(10):
            thread = threading.Thread(target=self.checkIps, args=(ips,))
            thread.start()
            threads.append(thread)
        for th in threads:
            th.join()
        print('IP test completed!')
        print('(The number of available IPs is: [%d])' % len(self.userfulProxies))
        print('IP proxy efficiency is {:.2f}%'.format((len(self.userfulProxies) / len(self.proxies)) * 100))
        return self.userfulProxies
    def spiderIps(self):
        ipss = []  # proxies collected from the three sites
        userAgent = self.userAgent
        print('===Initialized, ready to start crawling IPs!')

        # Site 1: www.nimadaili.com
        randomPage1 = random.randrange(1, 300)
        url1 = self.url1
        print('=' * 52)
        for i in range(randomPage1, randomPage1 + 10):
            headers = {'user-agent': userAgent.getUserAgent(),
                       'Referer': 'http://www.nimadaili.com/',
                       'Host': 'www.nimadaili.com'}
            try:
                response = requests.get(url=url1 % (i), headers=headers, timeout=6)
                if response.status_code == 200:
                    print("URL-1==>page-%d is successful!" % (i))
                    html1 = etree.HTML(response.text)
                    ips = html1.xpath("//table[@class='fl-table']/tbody/tr/td[1]/text()")
                    ipsType = html1.xpath("//table[@class='fl-table']/tbody/tr/td[2]/text()")  # proxy type: HTTP/HTTPS
                    proxies = ["{}://{}".format(ipsType[j][:4].lower(), ips[j]) for j in range(len(ips))]
                    ipss.extend(proxies)
                time.sleep(3)  # sleep 3 seconds after each page
            except Exception as e:
                print(e)
                print("URL-1==>page-%d failed!" % (i))

        # Site 2: www.89ip.cn (this site does not say whether a proxy is http or https)
        url2 = self.url2
        randomPage2 = random.randrange(1, 50)
        print('=' * 52)
        for i in range(randomPage2, randomPage2 + 10):
            headers = {'user-agent': userAgent.getUserAgent()}
            try:
                response2 = requests.get(url=url2 % (i), headers=headers, timeout=5)
                if response2.status_code == 200:
                    print("URL-2==>page-%d is successful!" % (i))
                    HTML1 = etree.HTML(response2.text)
                    ips1 = HTML1.xpath("//table[@class='layui-table']/tbody/tr/td[1]/text()")
                    ports1 = HTML1.xpath("//table[@class='layui-table']/tbody/tr/td[2]/text()")
                    n = len(ips1)
                    # the cell text is padded with whitespace, so strip it off
                    ips1 = ["{}:{}".format(ips1[i].strip(), ports1[i].strip()) for i in range(n)]
                    ips11 = []
                    for ip1 in ips1:
                        # register every address once as http and once as https
                        ips11.append('{}://{}'.format('http', ip1))
                        ips11.append('{}://{}'.format('https', ip1))
                    ipss.extend(ips11)
                time.sleep(2)
            except Exception:
                print("URL-2==>page-%d failed!" % (i))

        # Site 3: www.kuaidaili.com
        randomPage3 = random.randrange(1, 3000)
        url3 = self.url3
        print('=' * 52)
        for i in range(randomPage3, randomPage3 + 10):
            headers = {'user-agent': userAgent.getUserAgent()}
            try:
                response3 = requests.get(url=url3 % (i), headers=headers, timeout=5)
                if response3.status_code == 200:
                    print("URL-3==>page-%d is successful!" % (i))
                    HTML2 = etree.HTML(response3.text)
                    ips2 = HTML2.xpath("//table[@class='table table-bordered table-striped']/tbody/tr/td[1]/text()")
                    ports2 = HTML2.xpath("//table[@class='table table-bordered table-striped']/tbody/tr/td[2]/text()")
                    ipsType = HTML2.xpath("//table[@class='table table-bordered table-striped']/tbody/tr/td[4]/text()")
                    n2 = len(ips2)
                    ips2 = ["{}://{}:{}".format(ipsType[i].lower(), ips2[i], ports2[i]) for i in range(n2)]
                    ipss.extend(ips2)
                time.sleep(2)
            except Exception:
                print("URL-3==>page-%d failed!" % (i))

        # Use a set to drop duplicate proxies
        ipss = list(set(ipss))
        print('The total number of IP crawls is {}'.format(len(ipss)))
        # Turn 'http://1.2.3.4:80' into {'http': 'http://1.2.3.4:80'}, the shape requests expects
        proxiess = [{ip[:ip.find('://')]: ip} for ip in ipss]
        self.proxies = proxiess
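For completeness, a minimal usage sketch follows. It assumes the project-local crawlers.userAgent module imported at the top really does expose a useragent class with a getUserAgent() method (that is all this file relies on); the __main__ guard, the Baidu test URL, and the five-proxy sample are illustrative additions, not part of the original script.

if __name__ == '__main__':
    spider = IPs()
    usable = spider.getUserIps()  # crawl the three sites, then verify with 10 threads
    # Each entry is a dict such as {'http': 'http://1.2.3.4:8080'}, which is
    # the shape requests expects for its `proxies` argument.
    for proxy in usable[:5]:
        try:
            rsp = requests.get('https://www.baidu.com/', proxies=proxy, timeout=3)
            print(proxy, rsp.status_code)
        except Exception as e:
            print(proxy, 'request failed:', e)

Because checkIps pops work items from one shared list, the ten threads split the proxies between them without extra locking; list.pop() is atomic under CPython's GIL, so no proxy is tested twice.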