zl程序教程

您现在的位置是:首页 >  后端

当前栏目

asp.net(c#)做一个网页数据采集工具

2023-06-13 09:14:15 时间
通过这个软件,一两天就完成了几千条产品数据的录入。可见很多工作不必一味用人工去做;作为一个程序员,就是要把那些经常做重复性、繁琐工作的人解放出来。下面只是写了一些核心代码,而且采集必须要和对应网站相挂钩。作者:郑少群

代码如下:

//提取产品列表页中产品最终页的网页
/// <summary>
/// Click handler: downloads a product list page, extracts every <c>href</c> link from it,
/// normalises each link to an absolute http(s) address and writes the links to
/// <c>textBox5</c>, one per line. A message box reports how many links were found.
/// </summary>
/// <param name="sender">The control that raised the event.</param>
/// <param name="e">Event data (not used).</param>
private void button1_Click(object sender, EventArgs e)
{
    string pageUrl = textBox1.Text.Trim();
    string domain = textBox2.Text.Trim();

    if (pageUrl == "" || domain == "")
    {
        MessageBox.Show("网址和域名不能为空!", "信息提示", MessageBoxButtons.OK, MessageBoxIcon.Information);
        return;
    }

    try
    {
        // The published listing validated textBox1 but then fetched a hard-coded sample
        // address. NOTE(review): textBox1 is assumed to hold the list-page URL (the
        // validation message calls it "网址") - confirm against the form designer.
        string Html = inc.GetHtml(pageUrl);

        // Verbatim string: "" is one literal double quote. The pattern matches
        // href = 'value' or href = "value", including the "href" prefix and the quotes.
        ArrayList al = inc.GetMatchesStr(Html, @"href\s*=\s*(?:[\'\""\s](?<1>[^\""\']*)[\'\""])");

        StringBuilder sb = new StringBuilder();
        foreach (object item in al)
        {
            // Drop the surrounding quotes, then the leading "href =" prefix. The prefix is
            // anchored so that a "href=" occurring inside the URL itself is left alone.
            string a = item.ToString().Replace("\"", "").Replace("'", "");
            a = Regex.Replace(a, @"^href\s*=\s*", "", RegexOptions.IgnoreCase).Trim();

            // Site-relative links are prefixed with the domain entered in textBox2.
            if (a.StartsWith("/", StringComparison.Ordinal))
            {
                a = domain + a;
            }

            // Only add a scheme when none is present; https links must not be turned
            // into "http://https://...".
            bool hasScheme = a.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                          || a.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
            if (!hasScheme)
            {
                a = "http://" + a;
            }

            sb.Append(a).Append("\r\n");
        }

        // One extracted link per line.
        textBox5.Text = sb.ToString();

        MessageBox.Show("共提取" + al.Count.ToString() + "个链接", "信息提示", MessageBoxButtons.OK, MessageBoxIcon.Information);
    }
    catch (Exception err)
    {
        MessageBox.Show("提取出错!原因:" + err.Message, "信息提示", MessageBoxButtons.OK, MessageBoxIcon.Information);
    }
}




//把采集的产品页面html代码进行字符串处理,提取需要的代码,最后保存到本地一个access数据库中,同时提取产品图片地址并自动下载图片到本地images文件夹下

/// <summary>
/// Background job: empties the Tb_Product table, then fetches every URL listed in
/// <c>textBox5</c>, cuts the product name, model id and description out of each page's
/// HTML with site-specific string markers, downloads the product image into the local
/// "Images" folder, and finally writes the collected rows back to Tb_Product.
/// Per-URL failures are collected and passed to <c>ShowError</c> instead of aborting.
/// </summary>
/// <param name="sender">The <see cref="BackgroundWorker"/> running this job.</param>
/// <param name="e">Work event data (not used).</param>
private void backgroundWorker1_DoWork(object sender, DoWorkEventArgs e)
{
    // Start from an empty product table.
    Database.ExecuteNonQuery("delete from Tb_Product");

    BackgroundWorker worker = (BackgroundWorker)sender; // used for progress reporting

    // NOTE(review): this handler reads textBox2/textBox5 and writes
    // toolStripStatusLabel1 directly; if it runs on a worker thread those accesses
    // should move to the ProgressChanged/RunWorkerCompleted handlers - confirm.
    // NOTE(review): ToLower() also lower-cases URL paths, which breaks case-sensitive
    // servers - kept as in the original, confirm it is intended.
    string[] Urls = textBox5.Text.Trim().ToLower().Replace("\r\n", ",").Split(',');
    string domain = textBox2.Text.Trim();
    StringBuilder ErrorStr = new StringBuilder();
    string ImageDir = AppDomain.CurrentDomain.BaseDirectory + "Images\\";

    // Make sure the download folder exists so image downloads do not all fail.
    System.IO.Directory.CreateDirectory(ImageDir);

    DataTable dt2 = new DataTable();
    using (OleDbConnection conn = new OleDbConnection(Database.ConnectionStrings))
    using (OleDbDataAdapter da = new OleDbDataAdapter("select * from Tb_Product", conn))
    // The command builder is never referenced again, but it must exist: it generates
    // the INSERT command that da.Update() needs.
    using (OleDbCommandBuilder cb = new OleDbCommandBuilder(da))
    {
        // Fill is used to obtain the table's column layout; rows are then discarded.
        da.Fill(dt2);
        dt2.Rows.Clear();

        // Process each URL in turn.
        for (int i = 0; i < Urls.Length; i++)
        {
            if (worker.CancellationPending)
            {
                // Stop fetching; rows gathered so far are still saved below.
                break;
            }

            if (Urls[i] == "")
            {
                // Skip blank lines. (The original returned here, which skipped
                // da.Update and threw away every row collected so far.)
                continue;
            }

            try
            {
                string html = inc.GetHtml(Urls[i]);
                DataRow NewRow = dt2.NewRow();

                // Product name: text between <title> and </title>.
                string ProductName = html.Substring(IndexOfRequired(html, "<title>") + 7);
                ProductName = ProductName.Remove(IndexOfRequired(ProductName, "</title>")).Trim();
                NewRow["ProductName"] = ProductName;

                // Model id: whatever follows "Model:" in the product name.
                NewRow["ModelId"] = ProductName.Substring(IndexOfRequired(ProductName, "Model:") + 6).Trim();

                // Description: from 26 characters past "ProductDetails" up to and
                // including the next </table>. The offsets are specific to the target
                // site's markup and must be adapted for other sites.
                string Introduce = html.Substring(IndexOfRequired(html, "ProductDetails") + 26);
                Introduce = Introduce.Remove(IndexOfRequired(Introduce, "</table>") + 8).Trim();
                NewRow["Introduce"] = Introduce;

                // Image: the src attribute of the first <img> after "align=center>",
                // prefixed with the domain from textBox2.
                string ProductImage = html.Substring(IndexOfRequired(html, "align=center><img") + 17);
                ProductImage = domain + ProductImage.Substring(IndexOfRequired(ProductImage, "src=\"") + 5);
                ProductImage = ProductImage.Remove(IndexOfRequired(ProductImage, "\""));

                string localImagePath = ImageDir + ProductImage.Substring(ProductImage.LastIndexOf("/") + 1);
                try
                {
                    inc.DownFile(ProductImage, localImagePath);
                }
                catch (Exception)
                {
                    // A failed image download is logged but does not discard the row.
                    ErrorStr.Append("下载图片失败,图片地址:" + localImagePath + "\r\n");
                }

                dt2.Rows.Add(NewRow);

                worker.ReportProgress((i + 1) * 100 / Urls.Length, i);
                toolStripStatusLabel1.Text = "处理进度:" + (i + 1).ToString() + "/" + Urls.Length.ToString();
            }
            catch (Exception err)
            {
                ErrorStr.Append("采集错误:" + err.Message + ";网址:" + Urls[i] + "\r\n");
            }
        }

        da.Update(dt2);
    }

    DataBind(dt2);
    ShowError(ErrorStr.ToString());
}

/// <summary>
/// Returns the index of <paramref name="marker"/> in <paramref name="text"/>, throwing
/// when the marker is absent so that a page with unexpected markup is reported as an
/// error instead of silently yielding a garbage substring.
/// </summary>
/// <exception cref="FormatException">The marker does not occur in the text.</exception>
private static int IndexOfRequired(string text, string marker)
{
    int index = text.IndexOf(marker, StringComparison.Ordinal);
    if (index < 0)
    {
        throw new FormatException("页面中未找到标记:" + marker);
    }
    return index;
}

/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns its body as text,
/// decoded with the character set announced by the server.
/// </summary>
/// <param name="url">Address of the page to fetch.</param>
/// <returns>The complete response body.</returns>
public static string GetHtml(string url)
{
    WebRequest request = WebRequest.Create(url);

    // using blocks release the connection and the reader even when reading fails.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    {
        // The server may send no charset, or a name .NET does not know;
        // Encoding.GetEncoding throws in both cases, so fall back to UTF-8.
        Encoding encoding = Encoding.UTF8;
        string charset = response.CharacterSet;
        if (!string.IsNullOrEmpty(charset))
        {
            try
            {
                encoding = Encoding.GetEncoding(charset);
            }
            catch (ArgumentException)
            {
                // Unknown charset name: keep the UTF-8 fallback.
            }
        }

        using (StreamReader sr = new StreamReader(response.GetResponseStream(), encoding))
        {
            return sr.ReadToEnd();
        }
    }
}


//提取HTML代码中的网址
/// <summary>
/// Returns every distinct substring of <paramref name="htmlCode"/> that matches
/// <paramref name="strRegex"/>, sorted in ascending order.
/// </summary>
/// <param name="htmlCode">Text to search.</param>
/// <param name="strRegex">Regular expression pattern; applied with IgnoreCase and Multiline.</param>
/// <returns>An <see cref="ArrayList"/> of the unique matched strings, sorted.</returns>
public static ArrayList GetMatchesStr(string htmlCode, string strRegex)
{
    ArrayList al = new ArrayList();

    Regex r = new Regex(strRegex, RegexOptions.IgnoreCase | RegexOptions.Multiline);

    foreach (Match match in r.Matches(htmlCode))
    {
        string strNew = match.Value;

        // Keep only the first occurrence of each matched string.
        if (!al.Contains(strNew))
        {
            al.Add(strNew);
        }
    }

    al.Sort();

    return al;
}

/// <summary>
/// Downloads the resource at <paramref name="Url"/> over HTTP and writes it to the
/// local file <paramref name="Path"/>, replacing any existing file.
/// </summary>
/// <param name="Url">Address of the file to download.</param>
/// <param name="Path">Full path of the local file to write.</param>
public static void DownFile(string Url, string Path)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);

    // using blocks release the connection and both streams even when the copy fails.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream stream = response.GetResponseStream())
    // FileMode.Create truncates an existing file; OpenOrCreate would leave stale
    // trailing bytes behind when the new download is shorter than the old file.
    using (FileStream fs = new FileStream(Path, FileMode.Create, FileAccess.Write))
    {
        byte[] buffer = new byte[1024];
        int n;
        while ((n = stream.Read(buffer, 0, buffer.Length)) > 0)
        {
            fs.Write(buffer, 0, n);
        }
    }
}