
nico / domain-crawling

domain-crawling.py 4.25 KB
nico committed on 2018-06-14 18:24 · bug fixed
import urllib.request
import itertools
import json
import sys
import time
import requests
import argparse
## Baidu Cloud API for querying domain name registration status
domainapi = 'https://cloud.baidu.com/api/bcd/search/status'
## The data5u.com order id for the proxy service
## Replace this id with your own
orderid = 'd6745b2f3ced66d5c21621f43dc65f5d'
## The API for fetching a proxy IP
proxyapi = 'http://api.ip.data5u.com/dynamic/get.html?order=' + orderid
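## Assumption: the proxy API returns a bare "host:port" line as plain text
## (pushget() below only strips newlines before using it), e.g.:
##   1.2.3.4:8080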
## Total number of domain labels to search
domainlistlength = 0
"""
Split the list
"""
def chunks(l, n):
return [l[i:i + n] for i in range(0, len(l), n)]
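## For example (values are illustrative):
##   chunks([1, 2, 3, 4, 5], 2) -> [[1, 2], [3, 4], [5]]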
"""
Crawling by domain length
"""
def crawling(length, suffix, path, openproxy, delayed):
# Open stroge file
strogefile = open(path + ".txt", 'wb+')
# abcdefghigklmnopqrstuvwxyz
# Gets all the domain combinations in a-z0-9
domainlist = list(itertools.product('abcdefghigklmnopqrstuvwxyz0123456789', repeat = length))
# Gets domain list length
global domainlistlength
domainlistlength = len(domainlist)
print('Prepare to retrieve domain name length:', domainlistlength)
# Block
#domainlistblocks = chunks(domainlist, 3)
index = 0
domainNames = []
for domain in domainlist:
try:
if index % 3 == 0:
pushget(domainNames, openproxy, index, strogefile)
domainNames = []
time.sleep(delayed)
label = {
"label": "".join(domain),
"tld": suffix
}
domainNames.append(label)
except:
print('An exception has occurred:', sys.exc_info()[0])
index += 1
strogefile.close()
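## Each batch handed to pushget() serializes to a JSON body of the form
## (label values here are illustrative):
##   {"domainNames": [{"label": "aaaa", "tld": "com"}, {"label": "aaab", "tld": "com"}]}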
"""
Push the request data to domain API
"""
def pushget(domainNames, openproxy, index, strogefile):
# When the agent is opened, the agent is changed every ten times
if openproxy and index % 6 == 0:
proxyip = requests.get(proxyapi)
proxyip = proxyip.text.replace('\n', '')
proxy={
"http":"http://" + proxyip,
"https":"http://" + proxyip
}
proxy = urllib.request.ProxyHandler(proxy)
opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
urllib.request.install_opener(opener)
print('Proxy by IP: ', proxyip)
# Get the agent no more than 10 times per second
time.sleep(0.1)
# Construct request data
requestdata = {
"domainNames":domainNames
}
requestdatajson = str(json.dumps(requestdata))
# Construction request header
headers = {
'Cache-Control':'no-cache',
'Content-Encoding':'gzip',
'Content-Length':len(requestdatajson),
'Connection':'keep-alive',
'Content-Type':'application/json;charset=UTF-8',
'User-Agent':'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.23 Mobile Safari/537.36'
}
# Initiate a request for data
request = urllib.request.Request(domainapi, data = bytes(requestdatajson, encoding='utf-8'), headers = headers)
result = urllib.request.urlopen(request)
data = result.read()
data = data.decode()
data = json.loads(data)
if data.get('status') == 200:
domaininfos = data.get('result').get('accurate')
for domaininfo in domaininfos:
if domaininfo.get('status') == 'UNREGISTERED':
strogefile.writelines([str(domaininfo).encode(), "\r\n".encode()])
strogefile.flush()
print(domaininfo, "-", index, "/", domainlistlength)
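## The parsing above assumes a response shaped roughly like the following
## (only the "status", "result" and "accurate" keys are confirmed by the code;
## other field names and values are illustrative):
##   {"status": 200, "result": {"accurate": [{"domain": "aaaa.com", "status": "UNREGISTERED"}]}}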
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Nico domain name crawler script')
    parser.add_argument('-p', '--path', help='Storage path for the available domain names found by the scan.')
    parser.add_argument('-l', '--length', type=int, default=4, help='Label length to scan; all combinations of a-z0-9 of this length are tried.')
    parser.add_argument('-o', '--openproxy', choices=['y', 'n'], default='n', help='Enable the IP proxy mode.')
    parser.add_argument('-d', '--delayed', type=float, default=0.1, help='Interval between batches, in seconds.')
    parser.add_argument('-s', '--suffix', default='com', help='Domain suffix (TLD).')
    args = parser.parse_args()
    # Default the output file name to the current timestamp in milliseconds
    if not args.path:
        args.path = str(int(time.time() * 1000))
    args.openproxy = (args.openproxy == 'y')
    crawling(length=args.length, suffix=args.suffix, path=args.path, openproxy=args.openproxy, delayed=args.delayed)
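## Example invocation (a minimal sketch; flags as defined above, output path
## "results" is illustrative):
##   python domain-crawling.py -l 3 -s com -d 1 -o n -p results
## This tries all 36**3 = 46656 three-character labels under .com and writes
## the unregistered ones to results.txt.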