// crawler.aspx.cs — receives posted HTML (or a URL to fetch), extracts <a> links, and imports them into the user's data tables.
using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;
using System.Xml.Linq;
public partial class crawler : System.Web.UI.Page
{
    // Matches <a ...>innerHTML</a>; group 1 captures the inner HTML.
    // Cached as static readonly so the pattern is compiled once per app domain.
    private static readonly Regex AnchorRegex =
        new Regex(@"<a\s*[^>]*>([\s\S]+?)</a>", RegexOptions.IgnoreCase);

    /// <summary>
    /// Entry point for a crawl-import request. The POST body is either raw HTML
    /// or a URL; URLs are downloaded first. Extracted links are persisted for
    /// the user identified by the "username" cookie. Writes "1" on completion.
    /// </summary>
    protected void Page_Load(object sender, EventArgs e)
    {
        // Identify the user from the cookie; bail out silently when it is
        // absent or the client sent the literal string "undefined".
        HttpCookie userCookie = Request.Cookies["username"];
        if (userCookie == null || userCookie.Value == "undefined")
        {
            return; // no user context — do nothing
        }
        string username = userCookie.Value;

        string pageHtml = Server.UrlDecode(Request.Form.ToString());
        LOG.writer("crawler data: 10 " + pageHtml);

        // When the posted body is a URL, fetch the remote page instead.
        // Generalized from the original http-only check to also accept https.
        // NOTE(review): downloading a caller-supplied URL is an SSRF vector —
        // consider restricting allowed hosts.
        if (pageHtml.StartsWith("http:", StringComparison.OrdinalIgnoreCase) ||
            pageHtml.StartsWith("https:", StringComparison.OrdinalIgnoreCase))
        {
            LOG.writer("crawler data: 11");
            try
            {
                // using: WebClient is IDisposable (original leaked it; it also
                // assigned Credentials twice).
                using (WebClient client = new WebClient())
                {
                    // Network credentials for authenticating against the resource.
                    client.Credentials = CredentialCache.DefaultCredentials;
                    client.Headers.Add("user-agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705;)");
                    Byte[] pageData = client.DownloadData(pageHtml.Trim());
                    // Decode with the system default encoding first; if the page
                    // declares utf-8 (any casing), re-decode the bytes as UTF-8.
                    pageHtml = System.Text.Encoding.Default.GetString(pageData);
                    if (pageHtml.IndexOf("utf-8", StringComparison.OrdinalIgnoreCase) >= 0)
                    {
                        pageHtml = System.Text.Encoding.UTF8.GetString(pageData);
                    }
                }
            }
            catch (Exception ex)
            {
                // Best-effort: on download failure we fall through and import
                // whatever text we have (matches original behavior).
                LOG.writer("exception crawler: " + ex.Message);
            }
        }

        importData(pageHtml, username);
        Response.Write("1"); // success marker consumed by the client
    }

    /// <summary>
    /// Parses every anchor tag out of <paramref name="p"/> and appends them as a
    /// new table ("新导入数据") in the user's page data. Links are packed into
    /// rows; a row is closed once its estimated rendered width exceeds 550 units.
    /// No-op when the user's sendData.json does not exist.
    /// </summary>
    /// <param name="p">HTML text to mine for &lt;a&gt; elements.</param>
    /// <param name="username">Directory name under userdata; comes from a cookie.
    /// NOTE(review): used unvalidated in a file path — path-traversal risk; sanitize.</param>
    public void importData(string p, string username)
    {
        string userDataRoot = MapPath("") + "//userdata//";
        string filename = userDataRoot + username + "//sendData.json";
        if (!File.Exists(filename))
        {
            return; // user has no data store — nothing to import
        }
        LOG.writer("crawler data: " + filename);

        Match m = AnchorRegex.Match(p);

        // The "flag" query parameter selects the target page directory; defaults to "main".
        string directory = (Request.QueryString != null && Request.QueryString["flag"] != null)
            ? Request.QueryString["flag"]
            : "main";

        Xmlserdeser xser = new Xmlserdeser(userDataRoot, username, directory);
        page myPage = xser.Mpage;
        Random r = new Random();

        // One new table collects everything imported in this request.
        table tmpTable = new table(DateTime.Now.Millisecond.ToString() + r.Next().ToString() + "TB", "新导入数据", "");
        tmpTable.Mrows = new List<row>();
        myPage.Mtables.Add(tmpTable);

        // FIX: the original used `while (true)` with `continue` on a failed match,
        // which spun forever when the input contained no <a> tags. Loop on
        // m.Success instead so empty input writes an empty table and returns.
        while (m.Success)
        {
            int len = 0; // estimated rendered width of the current row
            row tmpRow = new row(DateTime.Now.Millisecond.ToString() + r.Next().ToString() + "CATE", ".", "0");
            tmpRow.Mrowwebs = new List<web>();
            tmpTable.Mrows.Add(tmpRow);
            while (m.Success)
            {
                // Extract the href attribute value from the anchor tag.
                string href = Regex.Match(m.Value, "href=[\'\"]{0,}.*[\'\"]{0,}", RegexOptions.IgnoreCase).Value;
                href = Regex.Replace(href, "href=[\'\"]*", "", RegexOptions.IgnoreCase);
                href = Regex.Replace(href, "[\'\"].*", "", RegexOptions.IgnoreCase);
                href = Regex.Replace(href, @"\s.*", "", RegexOptions.IgnoreCase);
                // Group 1 is the anchor's innerHTML; strip nested tags to keep text only.
                string key = m.Result("$1");
                key = Regex.Replace(key, @"<[^>]*>", "", RegexOptions.IgnoreCase);
                if (len > 550)
                {
                    // Row full: break WITHOUT advancing m — the outer loop opens a
                    // fresh row and this same match is re-processed into it.
                    break;
                }
                tmpRow.Mrowwebs.Add(new web(DateTime.Now.Millisecond.ToString() + r.Next().ToString() + "WEB", key, href, ".", "", "", "", ""));
                // Width heuristic: non-ASCII chars count as two ("aa"), 8 units
                // per char plus 5 units padding per link.
                len = len + Regex.Replace(key, "[^\x00-\xff]", "aa").Length * 8 + 5;
                m = m.NextMatch(); // advance to the next anchor in the HTML
            }
        }
        xser.write(myPage, xser.XMLFilename);
        xser.simplifyXmlandJSON();
    }
}
// (Trailing lines were a code-hosting-site moderation notice accidentally captured
// during scraping — not part of the source; neutralized as a comment.)