1 Star 3 Fork 0

il_持之以恒_li / Python爬虫

Create your Gitee Account
Explore and code with more than 8 million developers — free private repositories! :)
Sign up
Clone or Download
ip.py 3.07 KB
Copy Edit Web IDE Raw Blame History
from crawlers.userAgent import useragent
import requests
from lxml import etree
import random
import threading
import time
import sys
class IP(object):
    """Scrape free HTTP/HTTPS proxies from kuaidaili.com and filter the usable ones.

    Workflow: ``get_UsefulIp()`` scrapes five pages of proxies (``get_ips``),
    then verifies them concurrently with five worker threads (``check_ip``)
    against a known-reachable endpoint, and returns the proxies that worked.
    """

    def __init__(self):
        # Page template for the free proxy list (kuaidaili "inha" section).
        self.url = 'https://www.kuaidaili.com/free/inha/{}/'
        self.proxies_list = []  # every scraped proxy, as a requests-style proxies dict
        # Baidu is used as a cheap, reachable endpoint to verify each proxy.
        self.url2 = 'https://www.baidu.com'
        self.userfulIP = []  # proxies that answered with HTTP 200 through the proxy

    def progressBar(self):
        """Draw a simple 100-step textual progress bar on stdout (~12.5 s total)."""
        for i in range(1, 101):
            sys.stdout.write('\r')
            sys.stdout.write('{0}% |{1}'.format(int(i % 101), int((i % 101) / 2) * '■'))
            sys.stdout.flush()
            time.sleep(0.125)
        sys.stdout.write('\n')

    def check_ip(self, ips):
        """Worker: pop proxies off the shared list *ips* and keep the usable ones.

        Several threads share one list.  ``list.pop()`` is atomic in CPython,
        so instead of checking ``len(ips)`` first (which races with sibling
        threads between the check and the pop) we pop unconditionally and
        treat ``IndexError`` as the drain signal.
        """
        while True:
            try:
                ip = ips.pop()
            except IndexError:
                # List drained — possibly by a sibling thread.  We are done.
                break
            try:
                headers = {'User-Agent': useragent().getUserAgent()}
                # Very short timeout on purpose: a usable proxy must answer fast.
                rsp = requests.get(url=self.url2, headers=headers, proxies=ip, timeout=0.2)
                if rsp.status_code == 200:
                    self.userfulIP.append(ip)
                # print('========IP{}可用'.format(ip))  # uncomment when debugging
                time.sleep(1)  # pause one second between probes
            except Exception as e:
                # Dead proxies typically raise timeout/connection errors; log and move on.
                print(e)

    def get_UsefulIp(self):
        """Scrape proxies, verify them with 5 threads, return the usable ones.

        Returns:
            list[dict]: requests-style proxies dicts that passed the check.
        """
        self.get_ips()  # scrape 5 pages of proxies first
        ips = self.proxies_list
        threadList = []  # keep handles so we can join every worker
        for i in range(5):  # five verifier threads share the one list
            thread = threading.Thread(target=self.check_ip, args=(ips,))
            thread.start()
            threadList.append(thread)
        for thread in threadList:
            thread.join()
        self.progressBar()
        print('IP test completed!')
        print('(The number of available IPs is:[%d])' % len(self.userfulIP))
        return self.userfulIP

    def get_ips(self):
        """Scrape 5 consecutive proxy-list pages starting from a random page.

        Appends one requests-style proxies dict per table row to
        ``self.proxies_list``.
        """
        page = random.randrange(1, 3000)
        for i in range(page, page + 5):  # adjust the page count here if needed
            print('=======>Requesting page-{}'.format(i))
            # BUG FIX: the original used `self.url.format(page)`, which fetched
            # the same page five times; the loop variable `i` is correct.
            url = self.url.format(i)
            headers = {'User-Agent': useragent().getUserAgent()}
            response = requests.get(url=url, headers=headers)
            HTML = etree.HTML(response.text)
            infos = HTML.xpath("//table[@class='table table-bordered table-striped']/tbody/tr")
            for info in infos:
                proxies_dict = {}
                ip = info.xpath('./td[1]/text()')[0]       # proxy address
                ip_port = info.xpath('./td[2]/text()')[0]  # proxy port
                ip_type = info.xpath('./td[4]/text()')[0]  # scheme: HTTP / HTTPS
                proxies_dict[ip_type.lower()] = '{}://{}:{}'.format(ip_type.lower(), ip, ip_port)
                self.proxies_list.append(proxies_dict)
            # Throttle: sleep a few seconds after each page to avoid hammering the site.
            time.sleep(3)
        print('(The total number of IP crawled is:[%d])' % len(self.proxies_list))

Comment ( 0 )

Sign in to post a comment

Python
1
https://gitee.com/il_li/pythonSpider.git
git@gitee.com:il_li/pythonSpider.git
il_li
pythonSpider
Python爬虫
master

Search