[转载]C#程序抓取网页源码实例(winform程序)-纯野.
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Text;
using System.Windows.Forms;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;
using System.Collections;
namespace CopyHtml
{
public partial class Form1 : Form
{
public Form1()
{
InitializeComponent();
}
private void button1_Click(object sender, EventArgs e)
{
//获取指定网页中的源数据
string rl;
WebRequest Request = WebRequest.Create(textBox1.Text.Trim());
WebResponse Response = Request.GetResponse();
Stream resStream = Response.GetResponseStream();
StreamReader sr = new StreamReader(resStream, Encoding.Default);
StringBuilder sb = new StringBuilder();
while ((rl = sr.ReadLine()) != null)
{
sb.Append(rl);
}
textBox2.Text = sb.ToString();//抓取得到的源网页
string he = textBox2.Text.ToString();
textBox3.Text = stripHtml(he);//去除html标签后得到的源网页
Match TitleMatch = Regex.Match(he, “
string title = TitleMatch.Groups[1].Value;
textBox4.Text = (“网页的标题是:” + title );
}
///
///
/// 待转化的字符串
///
private string stripHtml(string strHtml)
{
Regex objRegExp = new Regex(“<(.|\n)+?>“);
string strOutput = objRegExp.Replace(strHtml, “”);
strOutput = strOutput.Replace(“<", "<");
strOutput = strOutput.Replace(">“, “>”);
return strOutput;
}
// 提取HTML代码中的网址
public static ArrayList GetHyperLinks(string htmlCode)
{
ArrayList al = new ArrayList();
string strRegex = @”(href)[ ]*=[ ]*[“”‘][^””‘#>]+[“”‘]”;
Regex r = new Regex(strRegex, RegexOptions.IgnoreCase);
MatchCollection m = r.Matches(htmlCode);
for (int i = 0; i <= m.Count - 1; i++) { bool rep = false; string strNew = m[i].ToString(); // 过滤重复的URL foreach (string str in al) { if (strNew == str) { rep = true; break; } } if (!rep) al.Add(strNew); } al.Sort(); return al; } } } [/csharp]