using System;
using System.IO;
using System.Net;
using System.Text;
using HtmlAgilityPack;
using mshtml;

/// <summary>
/// Downloads a shop-listing page and writes one comma-separated row per list
/// item to a result file, using either the mshtml or the HtmlAgilityPack parser.
/// Failures are appended to a separate error file.
/// </summary>
class HTMLCrawler
{
    /// <summary>User-agent header sent with every page request.</summary>
    private const String UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705;)";

    /// <summary>Id of the element that wraps the shop list (mshtml parser).</summary>
    private const String ShopListElementId = "shop-all-list";

    /// <summary>Absolute XPath selecting every shop list item (HtmlAgilityPack parser).</summary>
    private const String ShopListItemXPath = "/html[1]/body[1]/div[4]/div[3]/div[1]/div[1]/div[2]/ul[1]/li";

    /// <summary>
    /// Downloads <paramref name="uri"/>, optionally saves the raw HTML next to the
    /// executable, and extracts the shop rows with the selected parser.
    /// </summary>
    /// <param name="index">Page index; used only to name the saved HTML file.</param>
    /// <param name="uri">Address of the page to download.</param>
    /// <param name="szResultPath">File that extracted rows are appended to.</param>
    /// <param name="szErrorPath">File that error messages are appended to.</param>
    /// <param name="htmlEngin">Which HTML parser to use.</param>
    /// <returns>
    /// Counters for the page. <c>bSuccess</c> is false when the download failed or
    /// the shop list could not be located in the page.
    /// </returns>
    private PhaseResultBean PhaseHtml(int index, Uri uri, String szResultPath, String szErrorPath, HTMLEnginType htmlEngin)
    {
        PhaseResultBean result = new PhaseResultBean();
        try
        {
            String pageHtml;
            // WebClient is IDisposable; release it as soon as the download is done.
            using (WebClient client = new WebClient())
            {
                client.Headers.Add("user-agent", UserAgent);
                Byte[] pageData = client.DownloadData(uri);
                pageHtml = Encoding.UTF8.GetString(pageData);
            }

            if (checkSavePages.Checked)
            {
                String szHtmlPath = XWin32.getExeParentPath() + index.ToString() + ".html";
                AppendLine(szHtmlPath, pageHtml);
            }

            // The download succeeded; a parser resets this flag if it cannot
            // find the shop list at all.
            result.bSuccess = true;

            switch (htmlEngin)
            {
                case HTMLEnginType.HTMLEngin_mshtml:
                    PhaseHtml_mshtml(pageHtml, szResultPath, szErrorPath, result);
                    break;
                case HTMLEnginType.HTMLEngin_HtmlAgilityPack:
                    PhaseHtml_HtmlAgilityPack(pageHtml, szResultPath, szErrorPath, result);
                    break;
            }
        }
        catch (WebException webEx)
        {
            AppendLine(szErrorPath, webEx.Message);
            result.bSuccess = false;
        }
        return result;
    }

    /// <summary>
    /// Parses <paramref name="pageHtml"/> with mshtml and appends one row per child
    /// of the first child of the "shop-all-list" element.
    /// </summary>
    /// <param name="pageHtml">Raw HTML of the page.</param>
    /// <param name="szResultPath">File that extracted rows are appended to.</param>
    /// <param name="szErrorPath">File that error messages are appended to.</param>
    /// <param name="result">Counters updated in place (total / successed / failed / bSuccess).</param>
    private void PhaseHtml_mshtml(String pageHtml, String szResultPath, String szErrorPath, PhaseResultBean result)
    {
        mshtml.HTMLDocument docObject = new mshtml.HTMLDocument();
        mshtml.IHTMLDocument2 doc2 = (mshtml.IHTMLDocument2)docObject;
        doc2.write(pageHtml);
        doc2.close();
        mshtml.IHTMLDocument3 doc3 = (mshtml.IHTMLDocument3)docObject;

        IHTMLElement shopList = doc3.getElementById(ShopListElementId);
        if (shopList == null)
        {
            // Unexpected page layout: record it instead of crashing the crawl.
            AppendLine(szErrorPath, "Element '" + ShopListElementId + "' was not found in the page.");
            result.bSuccess = false;
            return;
        }

        // Look the item collection up once and reuse it for both count and iteration.
        var shopItems = shopList.children[0].children;
        int len = shopItems.length;
        result.total += len;

        foreach (IHTMLElement li in shopItems)
        {
            try
            {
                // children[1] of each item holds every field that is extracted below.
                var info = li.children[1];
                var comment = info.children[1];
                var tagAddr = info.children[2];
                var scores = info.children[3];

                IHTMLElement title = info.children[0];
                String szTitle = title.innerText;
                if (szTitle != null)
                {
                    szTitle = szTitle.Replace("\r\n", "-");
                }

                IHTMLElement star = comment.children[0];
                String szStar = star.getAttribute("title");

                IHTMLElement reviewNum = comment.children[1];
                String szReviewNum = reviewNum.innerText;

                IHTMLElement meanPrice = comment.children[3];
                String szMeanPrice = meanPrice.innerText;

                IHTMLElement category = tagAddr.children[0];
                String szCategory = category.innerText;

                IHTMLElement address = tagAddr.children[3];
                String szAddress = address.innerText;
                if (szAddress != null)
                {
                    // String.Replace returns a new string; the result must be kept,
                    // otherwise commas in the address break the CSV columns.
                    szAddress = szAddress.Replace(",", "-");
                }

                IHTMLElement taste = scores.children[0];
                String szTaste = taste.innerText;

                IHTMLElement evn = scores.children[1];
                String szEvn = evn.innerText;

                IHTMLElement service = scores.children[2];
                String szService = service.innerText;

                // Write the extracted fields to the result file.
                AppendResultRow(szResultPath, szTitle, szStar, szReviewNum, szMeanPrice, szCategory, szAddress, szTaste, szEvn, szService);
                result.successed += 1;
            }
            catch (Exception Ex)
            {
                // A malformed item must not abort the rest of the page.
                AppendLine(szErrorPath, Ex.Message);
                result.failed += 1;
            }
        }
    }

    /// <summary>
    /// Parses <paramref name="pageHtml"/> with HtmlAgilityPack and appends one row
    /// per list item selected by the fixed shop-list XPath.
    /// </summary>
    /// <param name="pageHtml">Raw HTML of the page.</param>
    /// <param name="szResultPath">File that extracted rows are appended to.</param>
    /// <param name="szErrorPath">File that error messages are appended to.</param>
    /// <param name="result">Counters updated in place (total / successed / failed / bSuccess).</param>
    private void PhaseHtml_HtmlAgilityPack(String pageHtml, String szResultPath, String szErrorPath, PhaseResultBean result)
    {
        HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
        doc.LoadHtml(pageHtml);

        // SelectNodes returns null (not an empty collection) when nothing matches.
        HtmlNodeCollection nodes = doc.DocumentNode.SelectNodes(ShopListItemXPath);
        if (nodes == null)
        {
            AppendLine(szErrorPath, "No shop list items matched XPath '" + ShopListItemXPath + "'.");
            result.bSuccess = false;
            return;
        }

        result.total += nodes.Count;
        foreach (HtmlNode li in nodes)
        {
            try
            {
                HtmlNode titleA = RequireNode(li, "div[2]/div[1]/a[1]");
                // The second title link is optional.
                HtmlNode titleB = li.SelectSingleNode("div[2]/div[1]/a[2]");
                String szTitle = StripWhitespace(titleA.InnerText + "-" + (titleB == null ? "" : titleB.InnerText));

                HtmlNode star = RequireNode(li, "div[2]/div[2]/span[1]");
                HtmlAttribute starTitle = star.Attributes["title"];
                if (starTitle == null)
                {
                    throw new InvalidOperationException("Attribute 'title' not found on node: div[2]/div[2]/span[1]");
                }
                String szStar = starTitle.Value;

                String szReviewNum = StripWhitespace(RequireNode(li, "div[2]/div[2]/a[1]").InnerText);
                String szMeanPrice = StripWhitespace(RequireNode(li, "div[2]/div[2]/a[2]").InnerText);
                String szCategory = RequireNode(li, "div[2]/div[3]/a[1]").InnerText;

                HtmlNode addressA = RequireNode(li, "div[2]/div[3]/a[2]");
                HtmlNode addressB = RequireNode(li, "div[2]/div[3]/span[1]");
                // String.Replace returns a new string; the result must be kept,
                // otherwise commas in the address break the CSV columns.
                String szAddress = (addressA.InnerText + "-" + addressB.InnerText).Replace(",", "-");

                String szTaste = RequireNode(li, "div[2]/span[1]/span[1]").InnerText;
                String szEvn = RequireNode(li, "div[2]/span[1]/span[2]").InnerText;
                String szService = RequireNode(li, "div[2]/span[1]/span[3]").InnerText;

                // Write the extracted fields to the result file.
                AppendResultRow(szResultPath, szTitle, szStar, szReviewNum, szMeanPrice, szCategory, szAddress, szTaste, szEvn, szService);
                result.successed += 1;
            }
            catch (Exception Ex)
            {
                // A malformed item must not abort the rest of the page.
                AppendLine(szErrorPath, Ex.Message);
                result.failed += 1;
            }
        }
    }

    /// <summary>
    /// Returns the first node matching <paramref name="xpath"/> under
    /// <paramref name="parent"/>, or throws with the XPath in the message so the
    /// error file says which field was missing.
    /// </summary>
    private static HtmlNode RequireNode(HtmlNode parent, String xpath)
    {
        HtmlNode node = parent.SelectSingleNode(xpath);
        if (node == null)
        {
            throw new InvalidOperationException("Node not found: " + xpath);
        }
        return node;
    }

    /// <summary>Removes every "\n" and " " from <paramref name="text"/>; null stays null.</summary>
    private static String StripWhitespace(String text)
    {
        return text == null ? null : text.Replace("\n", "").Replace(" ", "");
    }

    /// <summary>Appends the fields, joined by commas, as one line of the result file.</summary>
    private static void AppendResultRow(String szResultPath, params String[] fields)
    {
        // String.Join writes null entries as empty strings, matching plain concatenation.
        AppendLine(szResultPath, String.Join(",", fields));
    }

    /// <summary>Appends <paramref name="line"/> followed by a newline to the file at <paramref name="path"/>.</summary>
    private static void AppendLine(String path, String line)
    {
        using (StreamWriter sw = new StreamWriter(path, true))
        {
            sw.WriteLine(line);
        }
    }
}

/// <summary>Outcome counters for one crawled page.</summary>
class PhaseResultBean
{
    /// <summary>False when the page download failed or the shop list was not found.</summary>
    public Boolean bSuccess;

    /// <summary>Number of list items found on the page.</summary>
    public int total;

    /// <summary>Number of list items whose row was written to the result file.</summary>
    public int successed;

    /// <summary>Number of list items that raised an error while being extracted or written.</summary>
    public int failed;
}

/// <summary>Selects which HTML parser the crawler uses.</summary>
public enum HTMLEnginType
{
    /// <summary>Parse with the mshtml COM document model.</summary>
    HTMLEngin_mshtml,

    /// <summary>Parse with HtmlAgilityPack XPath queries.</summary>
    HTMLEngin_HtmlAgilityPack
}