[转载]C#程序抓取网页源码实例(winform程序)

[转载]C#程序抓取网页源码实例(winform程序)-纯野.

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Text;
using System.Windows.Forms;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;
using System.Collections;

namespace CopyHtml
{
    /// <summary>
    /// Form that downloads the page whose URL is typed into textBox1 and shows
    /// its raw source (textBox2), the source with HTML tags removed (textBox3)
    /// and the page title (textBox4).
    /// </summary>
    public partial class Form1 : Form
    {
        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Click handler: fetches the page at the URL in textBox1 and fills the
        /// three output text boxes. Expected URL/network failures are reported
        /// in a message box instead of escaping from the event handler.
        /// </summary>
        private void button1_Click(object sender, EventArgs e)
        {
            string html;
            try
            {
                html = DownloadSource(textBox1.Text.Trim());
            }
            catch (Exception ex)
            {
                // Only swallow the failures a bad URL or a network problem is
                // expected to produce; anything else is a real bug, so rethrow.
                if (!(ex is UriFormatException
                      || ex is NotSupportedException
                      || ex is WebException
                      || ex is IOException))
                {
                    throw;
                }

                MessageBox.Show(ex.Message);
                return;
            }

            // Raw page source.
            textBox2.Text = html;

            // Page source with the HTML tags removed.
            textBox3.Text = stripHtml(html);

            // Page title: the text between <title> and </title>; empty when
            // the page has no title element.
            Match titleMatch = Regex.Match(
                html,
                "<title>([^<]*)</title>",
                RegexOptions.IgnoreCase | RegexOptions.Multiline);
            string title = titleMatch.Groups[1].Value;

            textBox4.Text = ("网页的标题是:" + title);
        }

        /// <summary>
        /// Downloads the resource at <paramref name="url"/> and returns its
        /// text with all lines concatenated (line breaks are dropped).
        /// </summary>
        /// <param name="url">Absolute URL of the page to download.</param>
        /// <returns>The page text, decoded with the system default encoding.</returns>
        private static string DownloadSource(string url)
        {
            WebRequest request = WebRequest.Create(url);
            StringBuilder sb = new StringBuilder();

            // Dispose response, stream and reader deterministically so the
            // underlying connection is released even when reading fails.
            using (WebResponse response = request.GetResponse())
            using (Stream resStream = response.GetResponseStream())
            // NOTE(review): Encoding.Default is the machine's ANSI code page; pages
            // served in another charset (e.g. UTF-8) may decode incorrectly.
            using (StreamReader sr = new StreamReader(resStream, Encoding.Default))
            {
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    sb.Append(line);
                }
            }

            return sb.ToString();
        }

        /// <summary>
        /// Removes HTML tags from a string.
        /// </summary>
        /// <param name="strHtml">The HTML text to convert.</param>
        /// <returns>
        /// The text with every <c>&lt;...&gt;</c> tag removed and the
        /// <c>&amp;lt;</c> / <c>&amp;gt;</c> entities turned back into
        /// angle brackets.
        /// </returns>
        private string stripHtml(string strHtml)
        {
            // (.|\n) lets a tag span several lines; +? keeps the match lazy so
            // each tag is removed on its own.
            Regex objRegExp = new Regex("<(.|\n)+?>");
            string strOutput = objRegExp.Replace(strHtml, "");

            // Decode the escaped angle brackets only after the tags are gone,
            // so decoded text is never mistaken for a tag.
            strOutput = strOutput.Replace("&lt;", "<");
            strOutput = strOutput.Replace("&gt;", ">");
            return strOutput;
        }

        /// <summary>
        /// Extracts the distinct <c>href</c> attributes found in HTML code.
        /// </summary>
        /// <param name="htmlCode">The HTML text to scan.</param>
        /// <returns>
        /// A sorted list of the distinct matches. Each entry is the whole
        /// attribute text (for example <c>href="http://example.com"</c>), not
        /// just the URL.
        /// </returns>
        public static ArrayList GetHyperLinks(string htmlCode)
        {
            ArrayList al = new ArrayList();

            // href = "..." or '...'; the value stops at a quote, '#' or '>'.
            string strRegex = @"(href)[ ]*=[ ]*[""'][^""'#>]+[""']";

            Regex r = new Regex(strRegex, RegexOptions.IgnoreCase);

            foreach (Match m in r.Matches(htmlCode))
            {
                string strNew = m.ToString();

                // Skip duplicate URLs.
                if (!al.Contains(strNew))
                {
                    al.Add(strNew);
                }
            }

            al.Sort();
            return al;
        }
    }
}
[/csharp]

赞(0) 打赏
分享到: 更多 (0)

觉得文章有用就打赏一下文章作者

支付宝扫一扫打赏

微信扫一扫打赏