/// <summary>
/// Handles the button click: when the text box contains something that looks like a URL,
/// downloads that page with <c>GetHtml</c> and strips its markup with <c>StripHTML</c>.
/// </summary>
/// <param name="sender">The control that raised the event.</param>
/// <param name="e">Event data (unused).</param>
private void button1_Click(object sender, EventArgs e)
{
    string s1 = this.textBox1.Text;

    // Deliberately loose check: a letters-only scheme followed by "://" and any run of
    // non-whitespace characters, found anywhere in the input. The character class must be
    // [a-zA-Z]; the range A-z would also accept '[', '\', ']', '^', '_' and '`'.
    const string match = @"[a-zA-Z]+://[^\s]*";
    if (!Regex.IsMatch(s1, match))
    {
        return;
    }

    try
    {
        string tmp = GetHtml(s1);
        if (tmp == null)
        {
            // GetHtml returns null when the download failed; there is nothing to strip.
            return;
        }

        // The stripped text is computed here but not consumed by this handler yet.
        string tmpend = StripHTML(tmp);
    }
    catch (Exception ex)
    {
        // Best effort: a page that cannot be fetched or parsed must not crash the UI,
        // but the failure is traced instead of being silently discarded.
        System.Diagnostics.Debug.WriteLine(ex);
    }
}
1.获取HTML
GetHtml(String Url)
View Code
/// <summary>
/// Downloads the page at <paramref name="Url"/> and returns its text, decoded as UTF-8 or,
/// when the UTF-8 decoding contains replacement characters, as GB2312.
/// </summary>
/// <param name="Url">Address of the page to download.</param>
/// <returns>
/// The decoded page text, or <c>null</c> when the request fails (the error is shown to the
/// user in a message box).
/// </returns>
public String GetHtml(String Url)
{
    WebRequest request = WebRequest.Create(Url);
    request.Timeout = 50000;

    WebResponse response;
    try
    {
        response = request.GetResponse();
    }
    catch (WebException e)
    {
        MessageBox.Show(e.Message);
        return null;
    }
    catch (Exception e)
    {
        MessageBox.Show(e.ToString());
        return null;
    }

    // Download the body once and keep the raw bytes, so the same payload can be decoded
    // with a second encoding without issuing a second request for the same URL.
    byte[] payload;
    using (response)
    using (Stream body = response.GetResponseStream())
    using (MemoryStream buffer = new MemoryStream())
    {
        body.CopyTo(buffer);
        payload = buffer.ToArray();
    }

    string utf8Text;
    using (StreamReader reader = new StreamReader(new MemoryStream(payload), Encoding.GetEncoding("UTF-8")))
    {
        utf8Text = reader.ReadToEnd();
    }

    // isLuan reports whether the decoded text contains garbled characters; a clean
    // UTF-8 decoding is returned as-is.
    if (!isLuan(utf8Text))
    {
        return utf8Text;
    }

    // Fall back to GB2312 only when the UTF-8 decoding was garbled.
    using (StreamReader reader = new StreamReader(new MemoryStream(payload), Encoding.GetEncoding("GB2312")))
    {
        return reader.ReadToEnd();
    }
}
2.去除HTML标记(正则表达式)
StripHTML(string strHtml)
View Code
/// <summary>
/// Removes HTML markup from a page source and returns the remaining text.
/// </summary>
/// <param name="strHtml">HTML source to clean; may be null or empty.</param>
/// <returns>
/// The text with script/style elements, tags, line breaks and tabs removed, non-breaking
/// space entities turned into spaces, runs of spaces collapsed to one, and apostrophes
/// replaced by a right single quotation mark. Returns an empty string for null or empty input.
/// </returns>
public static string StripHTML(string strHtml)
{
    if (string.IsNullOrEmpty(strHtml))
    {
        return string.Empty;
    }

    // Drop <script> and <style> elements together with their content, across line breaks
    // and regardless of letter case, so code and CSS do not leak into the text.
    string text = Regex.Replace(strHtml, @"(?is)<script\b[^>]*>.*?</script>", "");
    text = Regex.Replace(text, @"(?is)<style\b[^>]*>.*?</style>", "");

    // Turn non-breaking-space entities into ordinary spaces.
    text = Regex.Replace(text, "(?i)&nbsp;", " ");

    // Remove line breaks before stripping tags so tags split across lines are still matched.
    text = Regex.Replace(text, @"[\r\n]+", "");

    // Remove well-formed tags.
    text = Regex.Replace(text, "<[^<>]*>", "");

    // Remove tabs.
    text = Regex.Replace(text, @"\t+", "");

    // Replace the ASCII apostrophe with a right single quotation mark.
    text = text.Replace("'", "\u2019");

    // Collapse runs of spaces into a single space.
    text = Regex.Replace(text, " +", " ");

    // Final sweep: anything still enclosed in angle brackets, stray brackets, and any
    // entity that only became contiguous after the tags around it were removed.
    text = Regex.Replace(text, "<.+?>", "");
    text = text.Replace("<", "").Replace(">", "");
    text = text.Replace("&nbsp;", "");

    return text;
}
3.判断是否为乱码(编码):在GetHtml里调用。
View Code
/// <summary>
/// Reports whether <paramref name="txt"/> looks garbled, i.e. whether its UTF-8 encoding
/// contains the byte sequence 239 191 189 (0xEF 0xBF 0xBD, the encoding of the U+FFFD
/// replacement character).
/// </summary>
/// <param name="txt">Decoded text to inspect; may be null or empty.</param>
/// <returns><c>true</c> when the sequence is found; otherwise <c>false</c>.</returns>
bool isLuan(string txt)
{
    if (string.IsNullOrEmpty(txt))
    {
        return false;
    }

    byte[] bytes = Encoding.UTF8.GetBytes(txt);

    // Stop at Length - 2 so that bytes[i + 2] is always in range while the final three
    // bytes are still examined; a replacement character at the very end must be detected.
    for (int i = 0; i + 2 < bytes.Length; i++)
    {
        if (bytes[i] == 239 && bytes[i + 1] == 191 && bytes[i + 2] == 189)
        {
            return true;
        }
    }

    return false;
}