Category Archives: WebCrawler

Fetching JSON Web Content with Python

```python
# -*- coding: UTF-8 -*-
'''
Created on 20150206
@author: Hansen
'''
import urllib2
import io
import json

# Fetch HTML from URL
def fetch_html(index, url, keepHtml, resultFile):
    req = urllib2.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0')
    rsp = urllib2.urlopen(req)
    content = rsp.read()
    if keepHtml:
        with open(str(index) + '.html', 'w') as fileinfo:
            fileinfo.write(content)
        print("save file " + str(index) + '.html: ok')
    parse_json(content, resultFile)

# Parse the JSON response
def parse_json(content, resultFile):
    jsonData = json.loads(content)
    shops = jsonData['shopBeans']
    print(len(shops))
    # Open the result file once; appending with a fresh utf_16 stream per row
    # would insert a BOM before every record.
    with io.open(resultFile, 'a', encoding='utf_16') as fileinfo:
        for shop in shops:
            szTitle = shop['filterFullName'].replace("\r\n", "-").replace(" ", "")
            szStar = shop['shopPowerTitle']
            szMeanPrice = str(shop['avgPrice']).replace("\n", "").replace(" ", "")
            szAddressA = shop['mainRegionName']
            szAddressB = shop['address']
            szAddress = (szAddressA + "-" + szAddressB).replace("\r\n", "-").replace(" ", "")
            szTaste = shop['refinedScore1']
            szEvn = shop['refinedScore2']
            szService = shop['refinedScore3']
            fileinfo.write(szTitle + "," + szStar + "," + szMeanPrice + "," + szAddress + ","
                           + szTaste + "," + szEvn + "," + szService + "\n")
```
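The post only defines the fetch-and-parse step; it does not show how the function is driven. Below is a minimal driver sketch, assuming a hypothetical page-indexed URL pattern. `BASE_URL`, `PAGE_COUNT`, and `RESULT_FILE` are placeholders, not values from the original code.

```python
# Minimal driver sketch for the JSON crawler above.
# BASE_URL, PAGE_COUNT and RESULT_FILE are hypothetical placeholders;
# substitute the real paged API URL being crawled.
import time

BASE_URL = 'http://example.com/shops/json?page=%d'  # hypothetical URL pattern
PAGE_COUNT = 5
RESULT_FILE = 'result.txt'

for i in range(1, PAGE_COUNT + 1):
    fetch_html(i, BASE_URL % i, True, RESULT_FILE)  # keepHtml=True also saves i.html
    time.sleep(2)  # be polite: pause between requests
```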
Fetching JSON Web Content with C#
```csharp
using System;
using System.IO;
using System.Net;
using System.Text;
using Newtonsoft.Json;
using DaZhongDianPing.JsonBeans;

class JsonCrawler
{
    private PhaseResultBean PhaseJson(Uri uri, String szResultPath, String szErrorPath)
    {
        PhaseResultBean result = new PhaseResultBean();
        result.bSuccess = true;   // assume success until a web error occurs
        try
        {
            // Fetch the page
            WebClient client = new WebClient();
            client.Headers.Add("user-agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705;)");
            Byte[] pageData = client.DownloadData(uri);
            string pageHtml = Encoding.UTF8.GetString(pageData);
            JsonBeans.TopShopsBean topShops = JsonConvert.DeserializeObject<JsonBeans.TopShopsBean>(pageHtml);

            // Walk the deserialized JSON
            int len = topShops.ShopBeans.Length;
            result.total = len;
            foreach (ShopBean shop in topShops.ShopBeans)
            {
                try
                {
                    String szTitle = shop.FilterFullName;
                    if (szTitle != null) szTitle = szTitle.Replace("\r\n", "-");
                    String szStar = shop.ShopPowerTitle;
                    String szMeanPrice = shop.AvgPrice.ToString();
                    String szRegionName = shop.MainRegionName;
                    String szAddress = shop.Address;
                    if (szAddress != null) szAddress = szAddress.Replace(",", "-");   // keep the CSV columns intact
                    String szTaste = shop.RefinedScore1;
                    String szEvn = shop.RefinedScore2;
                    String szService = shop.RefinedScore3;

                    // Write the extracted fields to the result file
                    using (StreamWriter sw = new StreamWriter(szResultPath, true))
                    {
                        sw.WriteLine(szTitle + "," + szStar + "," + szMeanPrice + "," + szRegionName + "," +
                                     szAddress + "," + szTaste + "," + szEvn + "," + szService);
                    }
                    result.successed += 1;
                }
                catch (Exception Ex)
                {
                    using (StreamWriter sw = new StreamWriter(szErrorPath, true))
                    {
                        sw.WriteLine(Ex.Message);
                    }
                    result.failed += 1;
                }
            }
        }
        catch (WebException webEx)
        {
            using (StreamWriter sw = new StreamWriter(szErrorPath, true))
            {
                sw.WriteLine(webEx.Message);
            }
            result.bSuccess = false;
        }
        return result;
    }
}

class PhaseResultBean
{
    public Boolean bSuccess;
    public int total;
    public int successed;
    public int failed;
}

public enum JsonEnginType
{
    JsonEngin_Newtonsoft
}

// JSON bean classes matching the shop-list response
internal class ShopBean
{
    [JsonProperty("addDate")] public string AddDate { get; set; }
    [JsonProperty("addUser")] public object AddUser { get; set; }
    [JsonProperty("addUserName")] public object AddUserName { get; set; }
    [JsonProperty("address")] public string Address { get; set; }
    [JsonProperty("altName")] public string AltName { get; set; }
    [JsonProperty("avgPrice")] public int AvgPrice { get; set; }
    [JsonProperty("branchName")] public string BranchName { get; set; }
    [JsonProperty("branchTotal")] public int BranchTotal { get; set; }
    [JsonProperty("businessHours")] public string BusinessHours { get; set; }
    [JsonProperty("canSendSms")] public object CanSendSms { get; set; }
    [JsonProperty("categoryId")] public int CategoryId { get; set; }
    [JsonProperty("cityId")] public int CityId { get; set; }
    [JsonProperty("crossRoad")] public string CrossRoad { get; set; }
    [JsonProperty("defaultPic")] public string DefaultPic { get; set; }
    [JsonProperty("defaultPicBig")] public object DefaultPicBig { get; set; }
    [JsonProperty("dishTagList")] public string[][] DishTagList { get; set; }
    [JsonProperty("dishTags")] public string DishTags { get; set; }
    [JsonProperty("district")] public int District { get; set; }
    [JsonProperty("districtName")] public object DistrictName { get; set; }
    [JsonProperty("filterFullAdress")] public string FilterFullAdress { get; set; }
    [JsonProperty("filterFullName")] public string FilterFullName { get; set; }
    [JsonProperty("firstReviewId")] public int FirstReviewId { get; set; }
    [JsonProperty("firstUserFace")] public object FirstUserFace { get; set; }
    [JsonProperty("firstUserNickName")] public object FirstUserNickName { get; set; }
    [JsonProperty("fullAdress")] public string FullAdress { get; set; }
    [JsonProperty("fullName")] public string FullName { get; set; }
    [JsonProperty("glat")] public object Glat { get; set; }
    [JsonProperty("glng")] public object Glng { get; set; }
    [JsonProperty("groupFlag")] public object GroupFlag { get; set; }
    [JsonProperty("hasStaticMap")] public object HasStaticMap { get; set; }
    [JsonProperty("hits")] public int Hits { get; set; }
    [JsonProperty("isUserCanUpdate")] public object IsUserCanUpdate { get; set; }
    [JsonProperty("lastDate")] public string LastDate { get; set; }
    [JsonProperty("lastIp")] public object LastIp { get; set; }
    [JsonProperty("lastUser")] public object LastUser { get; set; }
    [JsonProperty("lastUserName")] public object LastUserName { get; set; }
    [JsonProperty("mainCategoryId")] public int MainCategoryId { get; set; }
    [JsonProperty("mainCategoryName")] public object MainCategoryName { get; set; }
    [JsonProperty("mainRegionId")] public int MainRegionId { get; set; }
    [JsonProperty("mainRegionName")] public string MainRegionName { get; set; }
    [JsonProperty("minUserMana")] public object MinUserMana { get; set; }
    [JsonProperty("monthlyHits")] public int MonthlyHits { get; set; }
    [JsonProperty("nearByTags")] public object NearByTags { get; set; }
    [JsonProperty("nearbyShops")] public object NearbyShops { get; set; }
    [JsonProperty("oldChainId")] public object OldChainId { get; set; }
    [JsonProperty("phoneNo")] public string PhoneNo { get; set; }
    [JsonProperty("phoneNo2")] public string PhoneNo2 { get; set; }
    [JsonProperty("picTotal")] public int PicTotal { get; set; }
    [JsonProperty("popularity")] public int Popularity { get; set; }
    [JsonProperty("power")] public int Power { get; set; }
    [JsonProperty("prevWeeklyHits")] public object PrevWeeklyHits { get; set; }
    [JsonProperty("priceInfo")] public object PriceInfo { get; set; }
    [JsonProperty("priceLevel")] public int PriceLevel { get; set; }
    [JsonProperty("primaryTag")] public string PrimaryTag { get; set; }
    [JsonProperty("promoId")] public int PromoId { get; set; }
    [JsonProperty("publicTransit")] public string PublicTransit { get; set; }
    [JsonProperty("refinedScore1")] public string RefinedScore1 { get; set; }
    [JsonProperty("refinedScore2")] public string RefinedScore2 { get; set; }
    [JsonProperty("refinedScore3")] public string RefinedScore3 { get; set; }
    [JsonProperty("regionId")] public int RegionId { get; set; }
    [JsonProperty("score")] public int Score { get; set; }
    [JsonProperty("score1")] public int Score1 { get; set; }
    [JsonProperty("score2")] public int Score2 { get; set; }
    [JsonProperty("score3")] public int Score3 { get; set; }
    [JsonProperty("score4")] public int Score4 { get; set; }
    [JsonProperty("searchKeyWord")] public object SearchKeyWord { get; set; }
    [JsonProperty("searchName")] public object SearchName { get; set; }
    [JsonProperty("shopGroupId")] public int ShopGroupId { get; set; }
    [JsonProperty("shopId")] public int ShopId { get; set; }
    [JsonProperty("shopName")] public string ShopName { get; set; }
    [JsonProperty("shopPower")] public int ShopPower { get; set; }
    [JsonProperty("shopPowerTitle")] public string ShopPowerTitle { get; set; }
    [JsonProperty("shopTagList")] public string[][] ShopTagList { get; set; }
    [JsonProperty("shopTags")] public string ShopTags { get; set; }
    [JsonProperty("shopTotalName")] public string ShopTotalName { get; set; }
    [JsonProperty("shopType")] public int ShopType { get; set; }
    [JsonProperty("similarShops")] public object SimilarShops { get; set; }
    [JsonProperty("suggestGA")] public object SuggestGA { get; set; }
    [JsonProperty("suggestReason")] public object SuggestReason { get; set; }
    [JsonProperty("todayHits")] public object TodayHits { get; set; }
    [JsonProperty("voteTotal")] public int VoteTotal { get; set; }
    [JsonProperty("webSite")] public object WebSite { get; set; }
    [JsonProperty("weeklyHits")] public int WeeklyHits { get; set; }
    [JsonProperty("wishTotal")] public object WishTotal { get; set; }
    [JsonProperty("writeUp")] public string WriteUp { get; set; }
}

internal class TopShopsBean
{
    [JsonProperty("categoryId")] public int CategoryId { get; set; }
    [JsonProperty("cityId")] public int CityId { get; set; }
    [JsonProperty("maxResults")] public int MaxResults { get; set; }
    [JsonProperty("rankType")] public int RankType { get; set; }
    [JsonProperty("shopBeans")] public ShopBean[] ShopBeans { get; set; }
    [JsonProperty("shopType")] public int ShopType { get; set; }
    [JsonProperty("skipResults")] public int SkipResults { get; set; }
}
```
Fetching HTML Web Content with C#
```csharp
using System;
using System.IO;
using System.Net;
using System.Text;
using mshtml;
using HtmlAgilityPack;

class HTMLCrawler
{
    private PhaseResultBean PhaseHtml(int index, Uri uri, String szResultPath, String szErrorPath, HTMLEnginType htmlEngin)
    {
        PhaseResultBean result = new PhaseResultBean();
        result.bSuccess = true;   // assume success until a web error occurs
        try
        {
            WebClient client = new WebClient();
            client.Headers.Add("user-agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705;)");
            Byte[] pageData = client.DownloadData(uri);
            string pageHtml = Encoding.UTF8.GetString(pageData);

            // checkSavePages is a checkbox on the author's form; XWin32.getExeParentPath() is a helper from the same project
            if (checkSavePages.Checked)
            {
                String szHtmlPath = XWin32.getExeParentPath() + index.ToString() + ".html";
                using (StreamWriter sw = new StreamWriter(szHtmlPath, true))
                {
                    sw.WriteLine(pageHtml);
                }
            }

            // Dispatch to the selected HTML parsing engine
            switch (htmlEngin)
            {
                case HTMLEnginType.HTMLEngin_mshtml:
                    PhaseHtml_mshtml(pageHtml, szResultPath, szErrorPath, result);
                    break;
                case HTMLEnginType.HTMLEngin_HtmlAgilityPack:
                    PhaseHtml_HtmlAgilityPack(pageHtml, szResultPath, szErrorPath, result);
                    break;
            }
        }
        catch (WebException webEx)
        {
            using (StreamWriter sw = new StreamWriter(szErrorPath, true))
            {
                sw.WriteLine(webEx.Message);
            }
            result.bSuccess = false;
        }
        return result;
    }

    private void PhaseHtml_mshtml(String pageHtml, String szResultPath, String szErrorPath, PhaseResultBean result)
    {
        mshtml.HTMLDocument docObject = new mshtml.HTMLDocument();
        mshtml.IHTMLDocument2 doc2 = docObject as mshtml.IHTMLDocument2;
        doc2.write(pageHtml);
        doc2.close();
        mshtml.IHTMLDocument3 doc3 = docObject as mshtml.IHTMLDocument3;
        int len = doc3.getElementById("shop-all-list").children[0].children.length;
        result.total += len;
        foreach (IHTMLElement li in doc3.getElementById("shop-all-list").children[0].children)
        {
            try
            {
                IHTMLElement title = li.children[1].children[0];
                String szTitle = title.innerText;
                if (szTitle != null) szTitle = szTitle.Replace("\r\n", "-");
                IHTMLElement star = li.children[1].children[1].children[0];
                String szStar = star.getAttribute("title");
                IHTMLElement reviewNum = li.children[1].children[1].children[1];
                String szReviewNum = reviewNum.innerText;
                IHTMLElement meanPrice = li.children[1].children[1].children[3];
                String szMeanPrice = meanPrice.innerText;
                IHTMLElement category = li.children[1].children[2].children[0];
                String szCategory = category.innerText;
                IHTMLElement address = li.children[1].children[2].children[3];
                String szAddress = address.innerText;
                if (szAddress != null) szAddress = szAddress.Replace(",", "-");   // keep the CSV columns intact
                IHTMLElement taste = li.children[1].children[3].children[0];
                String szTaste = taste.innerText;
                IHTMLElement evn = li.children[1].children[3].children[1];
                String szEvn = evn.innerText;
                IHTMLElement service = li.children[1].children[3].children[2];
                String szService = service.innerText;

                // Write the extracted fields to the result file
                using (StreamWriter sw = new StreamWriter(szResultPath, true))
                {
                    sw.WriteLine(szTitle + "," + szStar + "," + szReviewNum + "," + szMeanPrice + "," + szCategory + "," +
                                 szAddress + "," + szTaste + "," + szEvn + "," + szService);
                }
                result.successed += 1;
            }
            catch (Exception Ex)
            {
                using (StreamWriter sw = new StreamWriter(szErrorPath, true))
                {
                    sw.WriteLine(Ex.Message);
                }
                result.failed += 1;
            }
        }
    }

    private void PhaseHtml_HtmlAgilityPack(String pageHtml, String szResultPath, String szErrorPath, PhaseResultBean result)
    {
        HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
        doc.LoadHtml(pageHtml);
        HtmlAgilityPack.HtmlNodeCollection nodes = doc.DocumentNode.SelectNodes("/html[1]/body[1]/div[4]/div[3]/div[1]/div[1]/div[2]/ul[1]/li");
        result.total += nodes.Count;
        foreach (HtmlAgilityPack.HtmlNode li in nodes)
        {
            try
            {
                HtmlAgilityPack.HtmlNode titleA = li.SelectNodes("div[2]/div[1]/a[1]")[0];
                HtmlAgilityPack.HtmlNode titleB = li.SelectNodes("div[2]/div[1]/a[2]") == null ? null : li.SelectNodes("div[2]/div[1]/a[2]")[0];
                String szTitle = (titleA == null ? "" : titleA.InnerText) + "-" + (titleB == null ? "" : titleB.InnerText);
                szTitle = szTitle.Replace("\n", "").Replace(" ", "");
                HtmlAgilityPack.HtmlNode star = li.SelectNodes("div[2]/div[2]/span[1]")[0];
                String szStar = star.Attributes["title"].Value.ToString();
                HtmlAgilityPack.HtmlNode reviewNum = li.SelectNodes("div[2]/div[2]/a[1]")[0];
                String szReviewNum = reviewNum.InnerText;
                if (szReviewNum != null) szReviewNum = szReviewNum.Replace("\n", "").Replace(" ", "");
                HtmlAgilityPack.HtmlNode meanPrice = li.SelectNodes("div[2]/div[2]/a[2]")[0];
                String szMeanPrice = meanPrice.InnerText;
                if (szMeanPrice != null) szMeanPrice = szMeanPrice.Replace("\n", "").Replace(" ", "");
                HtmlAgilityPack.HtmlNode category = li.SelectNodes("div[2]/div[3]/a[1]")[0];
                String szCategory = category.InnerText;
                HtmlAgilityPack.HtmlNode addressA = li.SelectNodes("div[2]/div[3]/a[2]")[0];
                HtmlAgilityPack.HtmlNode addressB = li.SelectNodes("div[2]/div[3]/span[1]")[0];
                String szAddress = addressA.InnerText + "-" + addressB.InnerText;
                szAddress = szAddress.Replace(",", "-");   // keep the CSV columns intact
                HtmlAgilityPack.HtmlNode taste = li.SelectNodes("div[2]/span[1]/span[1]")[0];
                String szTaste = taste.InnerText;
                HtmlAgilityPack.HtmlNode evn = li.SelectNodes("div[2]/span[1]/span[2]")[0];
                String szEvn = evn.InnerText;
                HtmlAgilityPack.HtmlNode service = li.SelectNodes("div[2]/span[1]/span[3]")[0];
                String szService = service.InnerText;

                // Write the extracted fields to the result file
                using (StreamWriter sw = new StreamWriter(szResultPath, true))
                {
                    sw.WriteLine(szTitle + "," + szStar + "," + szReviewNum + "," + szMeanPrice + "," + szCategory + "," +
                                 szAddress + "," + szTaste + "," + szEvn + "," + szService);
                }
                result.successed += 1;
            }
            catch (Exception Ex)
            {
                using (StreamWriter sw = new StreamWriter(szErrorPath, true))
                {
                    sw.WriteLine(Ex.Message);
                }
                result.failed += 1;
            }
        }
    }
}

class PhaseResultBean
{
    public Boolean bSuccess;
    public int total;
    public int successed;
    public int failed;
}

public enum HTMLEnginType
{
    HTMLEngin_mshtml,
    HTMLEngin_HtmlAgilityPack
}
```
Fetching Web Content with Python
1. Parsing the page with BeautifulSoup
```python
'''
Created on 20150203
@author: Hansen
'''
import urllib2
import io
from bs4 import BeautifulSoup

# Fetch HTML from URL
def fetch_html(index, url, keepHtml, resultFile):
    req = urllib2.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0')
    rsp = urllib2.urlopen(req)
    content = rsp.read()
    if keepHtml:
        with open(str(index) + '.html', 'w') as fileinfo:
            fileinfo.write(content)
        print("save file " + str(index) + '.html: ok')
    parse_html(content, resultFile)

# Parse HTML
def parse_html(html, resultFile):
    soup = BeautifulSoup(html, "html.parser", from_encoding="utf8")
    lis = soup.select('div.shop-all-list li')
    print(len(lis))
    with io.open(resultFile, 'a', encoding='utf_16') as fileinfo:
        for li in lis:
            szTitle = li.select('div:nth-of-type(2) div:nth-of-type(1) a h4')[0].get_text()
            szTitle = szTitle.replace("\r\n", "-").replace(" ", "")
            szStar = li.select('div:nth-of-type(2) div:nth-of-type(3) span')[0]['title']
            szReviewNum = li.select('div:nth-of-type(2) div:nth-of-type(3) a:nth-of-type(1)')[0].get_text()
            szReviewNum = szReviewNum.replace("\n", "").replace(" ", "")
            szMeanPrice = li.select('div:nth-of-type(2) div:nth-of-type(3) a:nth-of-type(2)')[0].get_text()
            szMeanPrice = szMeanPrice.replace("\n", "").replace(" ", "")
            szCategory = li.select('div:nth-of-type(2) div:nth-of-type(4) a:nth-of-type(1)')[0].get_text()
            szAddressA = li.select('div:nth-of-type(2) div:nth-of-type(4) a:nth-of-type(2)')[0].get_text()
            szAddressB = li.select('div:nth-of-type(2) div:nth-of-type(4) span:nth-of-type(3)')[0].get_text()
            szAddress = (szAddressA + "-" + szAddressB).replace("\r\n", "-").replace(" ", "")
            szTaste = li.select('div:nth-of-type(2) span:nth-of-type(5) span:nth-of-type(1)')[0].get_text()
            szEvn = li.select('div:nth-of-type(2) span:nth-of-type(5) span:nth-of-type(2)')[0].get_text()
            szService = li.select('div:nth-of-type(2) span:nth-of-type(5) span:nth-of-type(3)')[0].get_text()
            fileinfo.write(szTitle + "," + szStar + "," + szReviewNum + "," + szMeanPrice + "," + szCategory + ","
                           + szAddress + "," + szTaste + "," + szEvn + "," + szService + "\n")
```
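Because the output is a comma-separated file, any comma left inside a scraped field shifts the columns; the C# versions replace "," with "-" for exactly this reason. A minimal alternative sketch using Python's standard csv module, which quotes such fields automatically, might look like the following. The helper name `write_row` and the file name `result.csv` are illustrative, not part of the original post.

```python
import csv

def write_row(resultFile, fields):
    # Append one record as a properly quoted CSV row.
    # fields is expected to be a list of unicode strings; they are encoded to
    # UTF-8 here because Python 2's csv module operates on byte strings.
    with open(resultFile, 'ab') as f:
        writer = csv.writer(f)
        writer.writerow([field.encode('utf-8') for field in fields])

# Usage (hypothetical, with the variables from parse_html above):
# write_row('result.csv', [szTitle, szStar, szReviewNum, szMeanPrice,
#                          szCategory, szAddress, szTaste, szEvn, szService])
```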
2. Parsing the page with PyQuery
```python
'''
Created on 20150203
@author: Hansen
'''
import urllib2
import io
from pyquery import PyQuery

# Fetch HTML from URL
def fetch_html(index, url, keepHtml, resultFile):
    req = urllib2.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0')
    rsp = urllib2.urlopen(req)
    content = rsp.read()
    if keepHtml:
        with open(str(index) + '.html', 'w') as fileinfo:
            fileinfo.write(content)
        print("save file " + str(index) + '.html: ok')
    parse_html(content, resultFile)

# Parse HTML
def parse_html(html, resultFile):
    doc = PyQuery(html)
    lis = doc('div.shop-all-list li')
    print(len(lis))
    with io.open(resultFile, 'a', encoding='utf_16') as fileinfo:
        for li in lis:
            li_doc = PyQuery(li)
            szTitle = li_doc('li div div a h4').text()
            szTitle = szTitle.replace("\r\n", "-").replace(" ", "")
            szStar = li_doc('li div div span').filter('.sml-rank-stars').attr('title')
            szReviewNum = li_doc('li div div a').filter('.review-num').text()
            szReviewNum = szReviewNum.replace("\n", "").replace(" ", "")
            szMeanPrice = li_doc('li div div a').filter('.mean-price').text()
            szMeanPrice = szMeanPrice.replace("\n", "").replace(" ", "")
            szCategory = li_doc('li div div a span').filter('.tag').eq(0).text()   # first .tag span: category
            szAddressA = li_doc('li div div a span').filter('.tag').eq(1).text()   # second .tag span: region
            szAddressB = li_doc('li div div span').filter('.addr').eq(0).text()
            szAddress = (szAddressA + "-" + szAddressB).replace("\r\n", "-").replace(" ", "")
            szTaste = li_doc('li div span span').eq(0).text()
            szEvn = li_doc('li div span span').eq(1).text()
            szService = li_doc('li div span span').eq(2).text()
            fileinfo.write(szTitle + "," + szStar + "," + szReviewNum + "," + szMeanPrice + "," + szCategory + ","
                           + szAddress + "," + szTaste + "," + szEvn + "," + szService + "\n")
```
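Both C# crawlers catch WebException and append failures to an error file, so one bad request does not stop the run; the Python fetchers above have no equivalent guard. A minimal sketch of the same idea, assuming the fetch_html signature used in these posts (the wrapper name fetch_html_safe and the error.log file name are placeholders):

```python
import urllib2

def fetch_html_safe(index, url, keepHtml, resultFile, errorFile='error.log'):
    # Call fetch_html and log network or parse errors instead of aborting the crawl.
    try:
        fetch_html(index, url, keepHtml, resultFile)
        return True
    except urllib2.URLError as e:   # DNS failures, HTTP errors, timeouts
        message = 'page %d: %s\n' % (index, e)
    except Exception as e:          # unexpected parse errors
        message = 'page %d: %s\n' % (index, e)
    with open(errorFile, 'a') as f:
        f.write(message)
    return False
```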