6 Star 0 Fork 0

hi / webCollection

Create your Gitee Account
Explore and code with more than 12 million developers. Free private repositories!
Sign up
This repository doesn't specify a license. Please pay attention to the specific project description and its upstream code dependencies when using it.
Clone or Download
crawler.aspx.cs 5.10 KB
Copy Edit Raw Blame History
kangqiang authored 2019-12-06 16:59 . fix utf-8
using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;
using System.Xml.Linq;
/// <summary>
/// ASP.NET code-behind that receives a URL (or raw HTML) via POST, downloads the
/// page when a URL is supplied, extracts every anchor (&lt;a&gt;) tag and imports the
/// links into the current user's stored page data.
/// </summary>
public partial class crawler : System.Web.UI.Page
{
    /// <summary>
    /// Entry point for each request. Identifies the user from the "username"
    /// cookie, optionally downloads the posted URL, then imports all links.
    /// Writes "1" to the response to signal success to the client script.
    /// </summary>
    protected void Page_Load(object sender, EventArgs e)
    {
        // The user is identified by a "username" cookie set elsewhere; without
        // a usable value there is nothing to import into.
        HttpCookie userCookie = Request.Cookies["username"];
        if (userCookie == null || userCookie.Value == "undefined")
        {
            return; // no authenticated user - do nothing
        }
        string username = userCookie.Value;

        // SECURITY: username comes from an untrusted cookie and is used to build
        // file-system paths in importData; reject values that could escape the
        // user-data directory (path traversal).
        if (username.IndexOfAny(Path.GetInvalidFileNameChars()) >= 0 || username.Contains(".."))
        {
            LOG.writer("crawler: rejected suspicious username cookie");
            return;
        }

        string pageHtml = Server.UrlDecode(Request.Form.ToString());
        LOG.writer("crawler data: 10 " + pageHtml);

        // If the posted body is a URL, fetch the remote page and import from its
        // HTML instead of the raw form data.
        if (pageHtml.StartsWith("http:", StringComparison.Ordinal))
        {
            LOG.writer("crawler data: 11");
            try
            {
                // WebClient is IDisposable - the original leaked it.
                using (WebClient client = new WebClient())
                {
                    // Network credentials used to authenticate the request.
                    client.Credentials = CredentialCache.DefaultCredentials;
                    client.Headers.Add("user-agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705;)");
                    Byte[] pageData = client.DownloadData(pageHtml.Trim()); // download the page bytes

                    // Decode with the system default encoding first; if the page
                    // declares a utf-8 charset (any casing, any position),
                    // re-decode the bytes as UTF-8.
                    pageHtml = System.Text.Encoding.Default.GetString(pageData);
                    if (pageHtml.IndexOf("utf-8", StringComparison.OrdinalIgnoreCase) >= 0)
                    {
                        pageHtml = System.Text.Encoding.UTF8.GetString(pageData);
                    }
                }
            }
            catch (Exception ex)
            {
                // Best-effort: on download failure fall through and import
                // whatever text we have (matches original behavior).
                LOG.writer("exception crawler: " + ex.Message);
            }
        }
        importData(pageHtml, username);
        Response.Write("1"); // "1" signals success to the client script
    }

    /// <summary>
    /// Parses every anchor tag out of <paramref name="p"/> and appends the links
    /// as a new table in the user's page data, then persists the result.
    /// </summary>
    /// <param name="p">HTML text to scan for links.</param>
    /// <param name="username">Owner of the user-data directory to import into.</param>
    public void importData(string p, string username)
    {
        string filename = MapPath("") + "//userdata//" + username + "//sendData.json";
        if (!File.Exists(filename))
        {
            return; // user has no data store yet - nothing to import into
        }
        LOG.writer("crawler data: " + filename);

        // Captures the inner HTML of each <a ...>...</a> pair.
        Regex reg = new Regex(@"<a\s*[^>]*>([\s\S]+?)</a>", RegexOptions.IgnoreCase);
        Match m = reg.Match(p);

        // Optional "flag" query parameter selects the target sub-page; default "main".
        string directory = "main";
        if (Request.QueryString != null && Request.QueryString["flag"] != null)
        {
            directory = Request.QueryString["flag"];
        }

        Xmlserdeser xser = new Xmlserdeser(MapPath("") + "//userdata//", username, directory);
        page myPage = xser.Mpage;
        Random r = new Random();

        // All imported links go into one new table; a new row is started whenever
        // the accumulated rendered width of the current row exceeds 550 units.
        // (Table title "新导入数据" = "newly imported data" is user-visible data;
        // kept as-is.)
        table tmpTable = new table(DateTime.Now.Millisecond.ToString() + r.Next().ToString() + "TB", "新导入数据", "");
        tmpTable.Mrows = new List<row>();
        myPage.Mtables.Add(tmpTable);

        // BUGFIX: the original looped forever when the input contained no <a> tag
        // ("continue" on a failed match never advanced m). Check success up front
        // and only iterate while there are matches left.
        bool isEndFlag = !m.Success;
        while (!isEndFlag)
        {
            int len = 0; // accumulated estimated pixel width of the current row
            row tmpRow = new row(DateTime.Now.Millisecond.ToString() + r.Next().ToString() + "CATE", ".", "0");
            tmpRow.Mrowwebs = new List<web>();
            tmpTable.Mrows.Add(tmpRow);
            while (true)
            {
                // Extract the href attribute value from the current match:
                // take the href=... fragment, strip the prefix, then cut at the
                // closing quote or first whitespace.
                string href = Regex.Match(m.Value, "href=[\'\"]{0,}.*[\'\"]{0,}", RegexOptions.IgnoreCase).Value;
                href = Regex.Replace(href, "href=[\'\"]*", "", RegexOptions.IgnoreCase);
                href = Regex.Replace(href, "[\'\"].*", "", RegexOptions.IgnoreCase);
                href = Regex.Replace(href, @"\s.*", "", RegexOptions.IgnoreCase);

                // $1 is the anchor's inner HTML; strip nested tags to keep only the text.
                string key = m.Result("$1");
                key = Regex.Replace(key, @"<[^>]*>", "", RegexOptions.IgnoreCase);

                if (len > 550)
                {
                    // Row is full: break to the outer loop, which starts a new row.
                    // m is NOT advanced, so this link is re-processed into the new row.
                    break;
                }
                tmpRow.Mrowwebs.Add(new web(DateTime.Now.Millisecond.ToString() + r.Next().ToString() + "WEB", key, href, ".", "", "", "", ""));
                // Width estimate: non-ASCII characters count double ("aa"),
                // 8 units per character plus 5 units padding per link.
                len = len + Regex.Replace(key, "[^\x00-\xff]", "aa").Length * 8 + 5;

                m = m.NextMatch(); // advance to the next anchor in the HTML
                isEndFlag = !m.Success;
                if (isEndFlag) break;
            }
        }
        xser.write(myPage, xser.XMLFilename);
        xser.simplifyXmlandJSON();
    }
}
1
https://gitee.com/yanglihao2006/webCollection.git
git@gitee.com:yanglihao2006/webCollection.git
yanglihao2006
webCollection
webCollection
master

Search