正 文:
下面给出在 C# 中获取网页源码、并自动识别网页编码(如 gb2312、utf-8 等)的函数:
需要引用以下命名空间:using System; using System.IO; using System.Net; using System.Text; using System.Text.RegularExpressions;
分割----
分割----
分割----
分割----
分割----
分割----
方法一:使用 WebClient。缺点是 WebClient 没有提供超时设置,其内部请求沿用默认的 100 秒超时时间,这个时间太长了。
/// <summary>
/// Downloads a page with <see cref="WebClient"/> and decodes it to a string,
/// detecting the encoding from the page's meta tag when none is supplied.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Encoding name of the target page (e.g. "gb2312", "utf-8").
/// Pass null or "" to detect it from the page's meta charset declaration.
/// </param>
/// <returns>The decoded page source.</returns>
private string getHtml(string url, string charSet)
{
    byte[] myDataBuffer;
    // WebClient is IDisposable; release it as soon as the download finishes.
    using (WebClient myWebClient = new WebClient())
    {
        myDataBuffer = myWebClient.DownloadData(url);
    }

    // First pass with the system default encoding. This is only reliable for
    // the ASCII part of the markup, which is enough to locate the charset name.
    string strWebData = Encoding.Default.GetString(myDataBuffer);

    Encoding encoding;
    if (!string.IsNullOrEmpty(charSet))
    {
        // Caller-supplied name: an unsupported name still throws, as before.
        encoding = Encoding.GetEncoding(charSet);
    }
    else
    {
        // Capture only the encoding token itself so that text following the
        // declaration on the same line (minified pages) is never included.
        Match charSetMatch = Regex.Match(
            strWebData,
            @"<meta[^>]*?charset\s*=\s*[""']?\s*([\w-]+)",
            RegexOptions.IgnoreCase);
        if (!charSetMatch.Success)
        {
            return strWebData;
        }

        try
        {
            encoding = Encoding.GetEncoding(charSetMatch.Groups[1].Value);
        }
        catch (System.ArgumentException)
        {
            // The page declares a charset this runtime does not know:
            // keep the default-encoded text instead of failing.
            return strWebData;
        }
    }

    // Re-decode only when the target encoding differs from the first pass.
    if (!encoding.Equals(Encoding.Default))
    {
        strWebData = encoding.GetString(myDataBuffer);
    }
    return strWebData;
}
2013.2.4 补充:上面的函数使用的是WebClient ,而WebClient 不能设置超时时间,所以改用HttpWebRequest ,这个可以设置连接超时时间和读取超时时间。
/// <summary>
/// Downloads a page with <see cref="HttpWebRequest"/> and decodes it with an
/// automatically detected encoding.
/// </summary>
/// <remarks>
/// Encoding priority: the charset in the Content-Type response header; otherwise
/// the charset declared in the page's meta tag; otherwise <paramref name="charSet"/>.
/// </remarks>
/// <param name="url">Full URL of the page.</param>
/// <param name="charSet">
/// Encoding used for the first decoding pass and as the fallback when the page
/// declares none; null or "" means "gb2312".
/// </param>
/// <returns>
/// The page source, or the string "Error:" followed by the exception message
/// when the request or decoding fails.
/// </returns>
private string getHtml(string url, string charSet)
{
    try
    {
        HttpWebRequest myHttpWebRequest = (HttpWebRequest)WebRequest.Create(url);
        myHttpWebRequest.Proxy = null;
        myHttpWebRequest.Timeout = 15 * 1000; // connect / response timeout
        // Stream.ReadTimeout is not supported once AutomaticDecompression wraps the
        // stream, so the read timeout is configured on the request instead.
        myHttpWebRequest.ReadWriteTimeout = 15 * 1000;
        myHttpWebRequest.Accept = "*/*";
        myHttpWebRequest.UserAgent = "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.1 (KHTML, like Gecko) Chrome/21.0.1180.83 Safari/536.1";
        myHttpWebRequest.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate; // transparently inflate gzip/deflate

        byte[] body;
        string contentType;
        // using blocks guarantee the response and its stream are released even
        // when reading throws.
        using (HttpWebResponse myHttpWebResponse = (HttpWebResponse)myHttpWebRequest.GetResponse())
        using (Stream stream = myHttpWebResponse.GetResponseStream())
        using (MemoryStream ms = new MemoryStream())
        {
            contentType = myHttpWebResponse.Headers["Content-Type"];

            // The network stream can be read only once; buffer it so the bytes
            // can be decoded again with a different encoding.
            byte[] buffer = new byte[1024];
            int sz;
            while ((sz = stream.Read(buffer, 0, buffer.Length)) > 0)
            {
                ms.Write(buffer, 0, sz);
            }
            body = ms.ToArray();
        }

        // 1. Charset from the response header. The header may be absent, and the
        //    value may be quoted or followed by further parameters.
        if (!string.IsNullOrEmpty(contentType))
        {
            Match headerMatch = Regex.Match(contentType, @"charset\s*=\s*[""']?\s*([\w-]+)", RegexOptions.IgnoreCase);
            Encoding headerEncoding = headerMatch.Success ? tryGetEncoding(headerMatch.Groups[1].Value) : null;
            if (headerEncoding != null)
            {
                return decodeBody(body, headerEncoding);
            }
        }

        // 2. First pass with the fallback encoding, so the meta tag can be searched.
        //    An unsupported caller-supplied name still surfaces as "Error:...".
        Encoding fallbackEncoding = Encoding.GetEncoding(string.IsNullOrEmpty(charSet) ? "gb2312" : charSet);
        string strWebData = decodeBody(body, fallbackEncoding);

        // 3. Charset declared in the page's meta tag, if any. The encoding lookup
        //    happens only after a successful match, so pages without a declaration
        //    keep the fallback result.
        Match charSetMatch = Regex.Match(strWebData, @"<meta[^>]*?charset\s*=\s*[""']?\s*([\w-]+)", RegexOptions.IgnoreCase);
        if (charSetMatch.Success)
        {
            Encoding metaEncoding = tryGetEncoding(charSetMatch.Groups[1].Value);
            if (metaEncoding != null && !metaEncoding.Equals(fallbackEncoding))
            {
                strWebData = decodeBody(body, metaEncoding);
            }
        }
        return strWebData;
    }
    catch (Exception ex) { return "Error:" + ex.Message; }
}

/// <summary>Resolves an encoding name, returning null when the name is empty or unsupported.</summary>
private static Encoding tryGetEncoding(string name)
{
    if (string.IsNullOrEmpty(name))
    {
        return null;
    }
    try
    {
        return Encoding.GetEncoding(name);
    }
    catch (ArgumentException)
    {
        return null;
    }
}

/// <summary>Decodes the buffered response body through a StreamReader using the given encoding.</summary>
private static string decodeBody(byte[] body, Encoding encoding)
{
    using (StreamReader sr = new StreamReader(new MemoryStream(body), encoding))
    {
        return sr.ReadToEnd();
    }
}
注意:网络响应返回的 Stream 只能顺序读取一次,不能重复读取;必须先把它的内容保存到 MemoryStream 中,之后每次读取前将 MemoryStream.Position 置为 0,即可重复利用这个流。