# -*- coding: UTF-8 -*-
'''
Created on 20150206
@author: Hansen
'''
import urllib2
import sys
import io
import json
#Fetch HTML from URL
# Fetch HTML from URL
def fecth_html(index, url, keepHtml, resultFile):
    """Download `url`, optionally save the raw body as '<index>.html',
    then pass the content to parse_json for extraction into `resultFile`.

    NOTE(review): the 'fecth' typo in the name is kept on purpose —
    external callers may depend on it.
    """
    req = urllib2.Request(url)
    # Spoof a desktop-browser User-Agent; the target site rejects the default Python UA.
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0')
    rsp = urllib2.urlopen(req)
    try:
        content = rsp.read()
    finally:
        rsp.close()  # FIX: the response was never closed
    if keepHtml:
        # FIX: use a context manager so the file handle is closed (it leaked before).
        with open(str(index) + '.html', 'w') as fileinfo:
            fileinfo.write(content)
        print("save file " + str(index) + '.html: ok')
    parse_json(content, resultFile)
#Parse HTML
# Parse JSON
def parse_json(content, resultFile):
    """Parse the shop-list JSON in `content` and append one CSV row per shop
    to `resultFile` (UTF-16, append mode).

    Row format: title,star,meanPrice,address,taste,environment,service.
    Title/address have "\r\n" collapsed to "-" and all spaces stripped.
    """
    jsonData = json.loads(content)
    shops = jsonData['shopBeans']
    print(len(shops))
    # FIX: open the result file once. The original reopened it inside the
    # loop for every shop and never closed it, leaking file handles.
    with io.open(resultFile, 'a', encoding='utf_16') as fileinfo:
        for shop in shops:
            szTitle = shop['filterFullName'].replace("\r\n", "-").replace(" ", "")
            szStar = shop['shopPowerTitle']
            # avgPrice is numeric in the JSON, hence the str() conversion.
            szMeanPrice = str(shop['avgPrice']).replace("\n", "").replace(" ", "")
            szAddress = (shop['mainRegionName'] + "-" + shop['address']).replace("\r\n", "-").replace(" ", "")
            szTaste = shop['refinedScore1']
            szEvn = shop['refinedScore2']
            szService = shop['refinedScore3']
            fileinfo.write(szTitle + "," + szStar + "," + szMeanPrice + "," + szAddress + ","
                           + szTaste + "," + szEvn + "," + szService + "\n")
Tag Archives: Crawler
CSharp抓取JSON网页内容
using Newtonsoft.Json;
using DaZhongDianPing.JsonBeans;
class JsonCrawler
{
    /// <summary>
    /// Downloads the JSON shop list at <paramref name="uri"/>, deserializes it
    /// with Newtonsoft.Json, and appends one CSV line per shop to
    /// <paramref name="szResultPath"/>. Per-shop failures are written to
    /// <paramref name="szErrorPath"/> and counted; a failed download sets
    /// bSuccess = false on the returned bean.
    /// </summary>
    private PhaseResultBean PhaseJson(Uri uri, String szResultPath, String szErrorPath)
    {
        PhaseResultBean result = new PhaseResultBean();
        // FIX: bSuccess was never set on the success path, so it stayed at
        // its default (false) even when the crawl succeeded.
        result.bSuccess = true;
        try
        {
            // Download the page, spoofing an IE user-agent.
            WebClient client = new WebClient();
            client.Headers.Add("user-agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705;)");
            Byte[] pageData = client.DownloadData(uri);
            string pageHtml = Encoding.UTF8.GetString(pageData);
            JsonBeans.TopShopsBean topShops = JsonConvert.DeserializeObject<JsonBeans.TopShopsBean>(pageHtml);
            // Walk the deserialized shops and emit one CSV row each.
            int len = topShops.ShopBeans.Length;
            result.total = len;
            foreach (ShopBean shop in topShops.ShopBeans)
            {
                try
                {
                    String szTitle = shop.FilterFullName;
                    if (szTitle != null) szTitle = szTitle.Replace("\r\n", "-");
                    String szStar = shop.ShopPowerTitle;
                    String szMeanPrice = shop.AvgPrice.ToString();
                    String szRegionName = shop.MainRegionName;
                    String szAddress = shop.Address;
                    // FIX: String.Replace returns a new string; the original
                    // discarded the result, leaving commas inside the CSV field.
                    if (szAddress != null) szAddress = szAddress.Replace(",", "-");
                    String szTaste = shop.RefinedScore1;
                    String szEvn = shop.RefinedScore2;
                    String szService = shop.RefinedScore3;
                    // Append the extracted fields to the result file.
                    using (StreamWriter sw = new StreamWriter(szResultPath, true))
                    {
                        sw.WriteLine(szTitle + "," + szStar + "," + szMeanPrice + "," + szRegionName + "," + szAddress + "," + szTaste + "," + szEvn + "," + szService);
                    }
                    result.successed += 1;
                }
                catch (Exception Ex)
                {
                    using (StreamWriter sw = new StreamWriter(szErrorPath, true))
                    {
                        sw.WriteLine(Ex.Message);
                    }
                    result.failed += 1;
                }
            }
        }
        catch (WebException webEx)
        {
            using (StreamWriter sw = new StreamWriter(szErrorPath, true))
            {
                sw.WriteLine(webEx.Message);
            }
            result.bSuccess = false;
        }
        return result;
    }
}
/// <summary>
/// Aggregated outcome of one crawl/parse pass over a shop list page.
/// </summary>
class PhaseResultBean
{
public Boolean bSuccess; // false when the whole HTTP request/parse failed
public int total;        // number of shop entries found on the page
public int successed;    // entries written to the result file
public int failed;       // entries that threw during extraction
}
/// <summary>
/// Selects the JSON deserialization engine (only Newtonsoft.Json is wired up).
/// </summary>
public enum JsonEnginType
{
JsonEngin_Newtonsoft
}
/// <summary>
/// DTO for one shop entry in the "shopBeans" array of the downloaded JSON.
/// Property names map 1:1 onto the JSON keys via [JsonProperty]; fields whose
/// JSON type is unknown/nullable are declared as object.
/// NOTE(review): "FilterFullAdress"/"FullAdress" keep the site's misspelling
/// of "Address" because the JsonProperty keys must match the payload.
/// </summary>
internal class ShopBean
{
[JsonProperty("addDate")]
public string AddDate { get; set; }
[JsonProperty("addUser")]
public object AddUser { get; set; }
[JsonProperty("addUserName")]
public object AddUserName { get; set; }
[JsonProperty("address")]
public string Address { get; set; }
[JsonProperty("altName")]
public string AltName { get; set; }
[JsonProperty("avgPrice")]
public int AvgPrice { get; set; }
[JsonProperty("branchName")]
public string BranchName { get; set; }
[JsonProperty("branchTotal")]
public int BranchTotal { get; set; }
[JsonProperty("businessHours")]
public string BusinessHours { get; set; }
[JsonProperty("canSendSms")]
public object CanSendSms { get; set; }
[JsonProperty("categoryId")]
public int CategoryId { get; set; }
[JsonProperty("cityId")]
public int CityId { get; set; }
[JsonProperty("crossRoad")]
public string CrossRoad { get; set; }
[JsonProperty("defaultPic")]
public string DefaultPic { get; set; }
[JsonProperty("defaultPicBig")]
public object DefaultPicBig { get; set; }
[JsonProperty("dishTagList")]
public string[][] DishTagList { get; set; }
[JsonProperty("dishTags")]
public string DishTags { get; set; }
[JsonProperty("district")]
public int District { get; set; }
[JsonProperty("districtName")]
public object DistrictName { get; set; }
[JsonProperty("filterFullAdress")]
public string FilterFullAdress { get; set; }
[JsonProperty("filterFullName")]
public string FilterFullName { get; set; }
[JsonProperty("firstReviewId")]
public int FirstReviewId { get; set; }
[JsonProperty("firstUserFace")]
public object FirstUserFace { get; set; }
[JsonProperty("firstUserNickName")]
public object FirstUserNickName { get; set; }
[JsonProperty("fullAdress")]
public string FullAdress { get; set; }
[JsonProperty("fullName")]
public string FullName { get; set; }
[JsonProperty("glat")]
public object Glat { get; set; }
[JsonProperty("glng")]
public object Glng { get; set; }
[JsonProperty("groupFlag")]
public object GroupFlag { get; set; }
[JsonProperty("hasStaticMap")]
public object HasStaticMap { get; set; }
[JsonProperty("hits")]
public int Hits { get; set; }
[JsonProperty("isUserCanUpdate")]
public object IsUserCanUpdate { get; set; }
[JsonProperty("lastDate")]
public string LastDate { get; set; }
[JsonProperty("lastIp")]
public object LastIp { get; set; }
[JsonProperty("lastUser")]
public object LastUser { get; set; }
[JsonProperty("lastUserName")]
public object LastUserName { get; set; }
[JsonProperty("mainCategoryId")]
public int MainCategoryId { get; set; }
[JsonProperty("mainCategoryName")]
public object MainCategoryName { get; set; }
[JsonProperty("mainRegionId")]
public int MainRegionId { get; set; }
[JsonProperty("mainRegionName")]
public string MainRegionName { get; set; }
[JsonProperty("minUserMana")]
public object MinUserMana { get; set; }
[JsonProperty("monthlyHits")]
public int MonthlyHits { get; set; }
[JsonProperty("nearByTags")]
public object NearByTags { get; set; }
[JsonProperty("nearbyShops")]
public object NearbyShops { get; set; }
[JsonProperty("oldChainId")]
public object OldChainId { get; set; }
[JsonProperty("phoneNo")]
public string PhoneNo { get; set; }
[JsonProperty("phoneNo2")]
public string PhoneNo2 { get; set; }
[JsonProperty("picTotal")]
public int PicTotal { get; set; }
[JsonProperty("popularity")]
public int Popularity { get; set; }
[JsonProperty("power")]
public int Power { get; set; }
[JsonProperty("prevWeeklyHits")]
public object PrevWeeklyHits { get; set; }
[JsonProperty("priceInfo")]
public object PriceInfo { get; set; }
[JsonProperty("priceLevel")]
public int PriceLevel { get; set; }
[JsonProperty("primaryTag")]
public string PrimaryTag { get; set; }
[JsonProperty("promoId")]
public int PromoId { get; set; }
[JsonProperty("publicTransit")]
public string PublicTransit { get; set; }
[JsonProperty("refinedScore1")]
public string RefinedScore1 { get; set; }
[JsonProperty("refinedScore2")]
public string RefinedScore2 { get; set; }
[JsonProperty("refinedScore3")]
public string RefinedScore3 { get; set; }
[JsonProperty("regionId")]
public int RegionId { get; set; }
[JsonProperty("score")]
public int Score { get; set; }
[JsonProperty("score1")]
public int Score1 { get; set; }
[JsonProperty("score2")]
public int Score2 { get; set; }
[JsonProperty("score3")]
public int Score3 { get; set; }
[JsonProperty("score4")]
public int Score4 { get; set; }
[JsonProperty("searchKeyWord")]
public object SearchKeyWord { get; set; }
[JsonProperty("searchName")]
public object SearchName { get; set; }
[JsonProperty("shopGroupId")]
public int ShopGroupId { get; set; }
[JsonProperty("shopId")]
public int ShopId { get; set; }
[JsonProperty("shopName")]
public string ShopName { get; set; }
[JsonProperty("shopPower")]
public int ShopPower { get; set; }
[JsonProperty("shopPowerTitle")]
public string ShopPowerTitle { get; set; }
[JsonProperty("shopTagList")]
public string[][] ShopTagList { get; set; }
[JsonProperty("shopTags")]
public string ShopTags { get; set; }
[JsonProperty("shopTotalName")]
public string ShopTotalName { get; set; }
[JsonProperty("shopType")]
public int ShopType { get; set; }
[JsonProperty("similarShops")]
public object SimilarShops { get; set; }
[JsonProperty("suggestGA")]
public object SuggestGA { get; set; }
[JsonProperty("suggestReason")]
public object SuggestReason { get; set; }
[JsonProperty("todayHits")]
public object TodayHits { get; set; }
[JsonProperty("voteTotal")]
public int VoteTotal { get; set; }
[JsonProperty("webSite")]
public object WebSite { get; set; }
[JsonProperty("weeklyHits")]
public int WeeklyHits { get; set; }
[JsonProperty("wishTotal")]
public object WishTotal { get; set; }
[JsonProperty("writeUp")]
public string WriteUp { get; set; }
}
/// <summary>
/// DTO for the top-level JSON document: paging/query metadata plus the
/// "shopBeans" array that JsonCrawler.PhaseJson iterates.
/// </summary>
internal class TopShopsBean
{
[JsonProperty("categoryId")]
public int CategoryId { get; set; }
[JsonProperty("cityId")]
public int CityId { get; set; }
[JsonProperty("maxResults")]
public int MaxResults { get; set; }
[JsonProperty("rankType")]
public int RankType { get; set; }
[JsonProperty("shopBeans")]
public ShopBean[] ShopBeans { get; set; }
[JsonProperty("shopType")]
public int ShopType { get; set; }
[JsonProperty("skipResults")]
public int SkipResults { get; set; }
}
CSharp抓取HTML网页内容
using mshtml;
using HtmlAgilityPack;
class HTMLCrawler
{
    /// <summary>
    /// Downloads the listing page at <paramref name="uri"/>, optionally saves
    /// the raw HTML as "&lt;index&gt;.html", and dispatches to the selected
    /// HTML parsing engine. Errors go to <paramref name="szErrorPath"/>.
    /// </summary>
    private PhaseResultBean PhaseHtml(int index, Uri uri, String szResultPath, String szErrorPath, HTMLEnginType htmlEngin)
    {
        PhaseResultBean result = new PhaseResultBean();
        // FIX: bSuccess was never set on the success path, so it stayed at
        // its default (false) even when the crawl succeeded.
        result.bSuccess = true;
        try
        {
            WebClient client = new WebClient();
            client.Headers.Add("user-agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705;)");
            Byte[] pageData = client.DownloadData(uri);
            string pageHtml = Encoding.UTF8.GetString(pageData);
            // checkSavePages is a UI control defined outside this view;
            // presumably a "keep raw pages" checkbox — TODO confirm.
            if (checkSavePages.Checked)
            {
                String szHtmlPath = XWin32.getExeParentPath() + index.ToString() + ".html";
                using (StreamWriter sw = new StreamWriter(szHtmlPath, true))
                {
                    sw.WriteLine(pageHtml);
                }
            }
            switch (htmlEngin)
            {
                case HTMLEnginType.HTMLEngin_mshtml:
                    PhaseHtml_mshtml(pageHtml, szResultPath, szErrorPath, result);
                    break;
                case HTMLEnginType.HTMLEngin_HtmlAgilityPack:
                    PhaseHtml_HtmlAgilityPack(pageHtml, szResultPath, szErrorPath, result);
                    break;
            }
        }
        catch (WebException webEx)
        {
            using (StreamWriter sw = new StreamWriter(szErrorPath, true))
            {
                sw.WriteLine(webEx.Message);
            }
            result.bSuccess = false;
        }
        return result;
    }

    /// <summary>
    /// Parses the shop list with the mshtml (IE DOM) engine, appending one
    /// CSV line per &lt;li&gt; to szResultPath and counting outcomes in result.
    /// </summary>
    private void PhaseHtml_mshtml(String pageHtml, String szResultPath, String szErrorPath, PhaseResultBean result)
    {
        mshtml.HTMLDocument docObject = new mshtml.HTMLDocument();
        mshtml.IHTMLDocument2 doc2 = docObject as mshtml.IHTMLDocument2;
        doc2.write(pageHtml);
        doc2.close();
        mshtml.IHTMLDocument3 doc3 = docObject as mshtml.IHTMLDocument3;
        int len = doc3.getElementById("shop-all-list").children[0].children.length;
        result.total += len;
        foreach (IHTMLElement li in doc3.getElementById("shop-all-list").children[0].children)
        {
            try
            {
                IHTMLElement title = li.children[1].children[0];
                String szTitle = title.innerText;
                if (szTitle != null) szTitle = szTitle.Replace("\r\n", "-");
                IHTMLElement star = li.children[1].children[1].children[0];
                String szStar = star.getAttribute("title");
                IHTMLElement reviewNum = li.children[1].children[1].children[1];
                String szReviewNum = reviewNum.innerText;
                IHTMLElement meanPrice = li.children[1].children[1].children[3];
                String szMeanPrice = meanPrice.innerText;
                IHTMLElement category = li.children[1].children[2].children[0];
                String szCategory = category.innerText;
                IHTMLElement address = li.children[1].children[2].children[3];
                String szAddress = address.innerText;
                // FIX: String.Replace returns a new string; the original
                // discarded the result, leaving commas inside the CSV field.
                if (szAddress != null) szAddress = szAddress.Replace(",", "-");
                IHTMLElement taste = li.children[1].children[3].children[0];
                String szTaste = taste.innerText;
                IHTMLElement evn = li.children[1].children[3].children[1];
                String szEvn = evn.innerText;
                IHTMLElement service = li.children[1].children[3].children[2];
                String szService = service.innerText;
                // Append the extracted fields to the result file.
                using (StreamWriter sw = new StreamWriter(szResultPath, true))
                {
                    sw.WriteLine(szTitle + "," + szStar + "," + szReviewNum + "," + szMeanPrice + "," + szCategory + "," + szAddress + "," + szTaste + "," + szEvn + "," + szService);
                }
                // FIX: successes were never counted here (JsonCrawler does count them).
                result.successed += 1;
            }
            catch (Exception Ex)
            {
                using (StreamWriter sw = new StreamWriter(szErrorPath, true))
                {
                    sw.WriteLine(Ex.Message);
                }
                result.failed += 1;
            }
        }
    }

    /// <summary>
    /// Parses the shop list with HtmlAgilityPack via XPath, appending one
    /// CSV line per &lt;li&gt; to szResultPath and counting outcomes in result.
    /// </summary>
    private void PhaseHtml_HtmlAgilityPack(String pageHtml, String szResultPath, String szErrorPath, PhaseResultBean result)
    {
        HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
        doc.LoadHtml(pageHtml);
        HtmlAgilityPack.HtmlNodeCollection nodes = doc.DocumentNode.SelectNodes("/html[1]/body[1]/div[4]/div[3]/div[1]/div[1]/div[2]/ul[1]/li");
        result.total += nodes.Count;
        foreach (HtmlAgilityPack.HtmlNode li in nodes)
        {
            try
            {
                // The title can be split over two anchors (name + branch).
                HtmlAgilityPack.HtmlNode titleA = li.SelectNodes("div[2]/div[1]/a[1]")[0];
                HtmlAgilityPack.HtmlNode titleB = li.SelectNodes("div[2]/div[1]/a[2]") == null ? null : li.SelectNodes("div[2]/div[1]/a[2]")[0];
                String szTitle = (titleA == null ? "" : titleA.InnerText) + "-" + (titleB == null ? "" : titleB.InnerText);
                szTitle = szTitle.Replace("\n", "").Replace(" ", "");
                HtmlAgilityPack.HtmlNode star = li.SelectNodes("div[2]/div[2]/span[1]")[0];
                String szStar = star.Attributes["title"].Value.ToString();
                HtmlAgilityPack.HtmlNode reviewNum = li.SelectNodes("div[2]/div[2]/a[1]")[0];
                String szReviewNum = reviewNum.InnerText;
                if (szReviewNum != null) szReviewNum = szReviewNum.Replace("\n", "").Replace(" ", "");
                HtmlAgilityPack.HtmlNode meanPrice = li.SelectNodes("div[2]/div[2]/a[2]")[0];
                String szMeanPrice = meanPrice.InnerText;
                if (szMeanPrice != null) szMeanPrice = szMeanPrice.Replace("\n", "").Replace(" ", "");
                HtmlAgilityPack.HtmlNode category = li.SelectNodes("div[2]/div[3]/a[1]")[0];
                String szCategory = category.InnerText;
                HtmlAgilityPack.HtmlNode addressA = li.SelectNodes("div[2]/div[3]/a[2]")[0];
                HtmlAgilityPack.HtmlNode addressB = li.SelectNodes("div[2]/div[3]/span[1]")[0];
                String szAddress = addressA.InnerText + "-" + addressB.InnerText;
                // FIX: String.Replace returns a new string; the original
                // discarded the result, leaving commas inside the CSV field.
                szAddress = szAddress.Replace(",", "-");
                HtmlAgilityPack.HtmlNode taste = li.SelectNodes("div[2]/span[1]/span[1]")[0];
                String szTaste = taste.InnerText;
                HtmlAgilityPack.HtmlNode evn = li.SelectNodes("div[2]/span[1]/span[2]")[0];
                String szEvn = evn.InnerText;
                HtmlAgilityPack.HtmlNode service = li.SelectNodes("div[2]/span[1]/span[3]")[0];
                String szService = service.InnerText;
                // Append the extracted fields to the result file.
                using (StreamWriter sw = new StreamWriter(szResultPath, true))
                {
                    sw.WriteLine(szTitle + "," + szStar + "," + szReviewNum + "," + szMeanPrice + "," + szCategory + "," + szAddress + "," + szTaste + "," + szEvn + "," + szService);
                }
                // FIX: successes were never counted here (JsonCrawler does count them).
                result.successed += 1;
            }
            catch (Exception Ex)
            {
                using (StreamWriter sw = new StreamWriter(szErrorPath, true))
                {
                    sw.WriteLine(Ex.Message);
                }
                result.failed += 1;
            }
        }
    }
}
/// <summary>
/// Aggregated outcome of one crawl/parse pass over a shop list page.
/// </summary>
class PhaseResultBean
{
public Boolean bSuccess; // false when the whole HTTP request/parse failed
public int total;        // number of shop entries found on the page
public int successed;    // entries written to the result file
public int failed;       // entries that threw during extraction
}
/// <summary>
/// Selects the HTML parsing engine used by HTMLCrawler.PhaseHtml.
/// </summary>
public enum HTMLEnginType
{
HTMLEngin_mshtml,
HTMLEngin_HtmlAgilityPack
}
Python抓取网页内容
1、BeautifulSoup解析网页
'''
Created on 20150203
@author: Hansen
'''
import urllib2
import sys
import io
from bs4 import BeautifulSoup
#Fetch HTML from URL
# Fetch HTML from URL
def fecth_html(index, url, keepHtml, resultFile):
    """Download `url`, optionally save the raw body as '<index>.html',
    then pass the content to parse_html (BeautifulSoup) for extraction
    into `resultFile`.

    NOTE(review): the 'fecth' typo in the name is kept on purpose —
    external callers may depend on it.
    """
    req = urllib2.Request(url)
    # Spoof a desktop-browser User-Agent; the target site rejects the default Python UA.
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0')
    rsp = urllib2.urlopen(req)
    try:
        content = rsp.read()
    finally:
        rsp.close()  # FIX: the response was never closed
    if keepHtml:
        # FIX: use a context manager so the file handle is closed (it leaked before).
        with open(str(index) + '.html', 'w') as fileinfo:
            fileinfo.write(content)
        print("save file " + str(index) + '.html: ok')
    parse_html(content, resultFile)
#Parse HTML
# Parse HTML
def parse_html(html, resultFile):
    """Extract shop rows from the listing page with BeautifulSoup and append
    one UTF-16 CSV line per shop to `resultFile`.

    Row format: title,star,reviewNum,meanPrice,category,address,taste,env,service.
    """
    soup = BeautifulSoup(html, fromEncoding="utf8")
    lis = soup.select('div.shop-all-list li')
    print(len(lis))
    # FIX: open the result file once. The original reopened it inside the
    # loop for every <li> and never closed it, leaking file handles.
    with io.open(resultFile, 'a', encoding='utf_16') as fileinfo:
        for li in lis:
            szTitle = li.select('div:nth-of-type(2) div:nth-of-type(1) a h4')[0].get_text()
            szTitle = szTitle.replace("\r\n", "-").replace(" ", "")
            szStar = li.select('div:nth-of-type(2) div:nth-of-type(3) span')[0]['title']
            szReviewNum = li.select('div:nth-of-type(2) div:nth-of-type(3) a:nth-of-type(1)')[0].get_text()
            szReviewNum = szReviewNum.replace("\n", "").replace(" ", "")
            szMeanPrice = li.select('div:nth-of-type(2) div:nth-of-type(3) a:nth-of-type(2)')[0].get_text()
            szMeanPrice = szMeanPrice.replace("\n", "").replace(" ", "")
            szCategory = li.select('div:nth-of-type(2) div:nth-of-type(4) a:nth-of-type(1)')[0].get_text()
            szAddressA = li.select('div:nth-of-type(2) div:nth-of-type(4) a:nth-of-type(2)')[0].get_text()
            szAddressB = li.select('div:nth-of-type(2) div:nth-of-type(4) span:nth-of-type(3)')[0].get_text()
            szAddress = (szAddressA + "-" + szAddressB).replace("\r\n", "-").replace(" ", "")
            szTaste = li.select('div:nth-of-type(2) span:nth-of-type(5) span:nth-of-type(1)')[0].get_text()
            szEvn = li.select('div:nth-of-type(2) span:nth-of-type(5) span:nth-of-type(2)')[0].get_text()
            szService = li.select('div:nth-of-type(2) span:nth-of-type(5) span:nth-of-type(3)')[0].get_text()
            fileinfo.write(szTitle + "," + szStar + "," + szReviewNum + "," + szMeanPrice + ","
                           + szCategory + "," + szAddress + "," + szTaste + "," + szEvn + ","
                           + szService + "\n")
2、PyQuery解析网页
'''
Created on 20150203
@author: Hansen
'''
import urllib2
import sys
import io
from pyquery import PyQuery
#Fetch HTML from URL
# Fetch HTML from URL
def fecth_html(index, url, keepHtml, resultFile):
    """Download `url`, optionally save the raw body as '<index>.html',
    then pass the content to parse_html (PyQuery) for extraction into
    `resultFile`.

    NOTE(review): the 'fecth' typo in the name is kept on purpose —
    external callers may depend on it.
    """
    req = urllib2.Request(url)
    # Spoof a desktop-browser User-Agent; the target site rejects the default Python UA.
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0')
    rsp = urllib2.urlopen(req)
    try:
        content = rsp.read()
    finally:
        rsp.close()  # FIX: the response was never closed
    if keepHtml:
        # FIX: use a context manager so the file handle is closed (it leaked before).
        with open(str(index) + '.html', 'w') as fileinfo:
            fileinfo.write(content)
        print("save file " + str(index) + '.html: ok')
    parse_html(content, resultFile)
#Parse HTML
# Parse HTML
def parse_html(html, resultFile):
    """Extract shop rows from the listing page with PyQuery and append one
    UTF-16 CSV line per shop to `resultFile`.

    Row format: title,star,reviewNum,meanPrice,category,address,taste,env,service.
    """
    doc = PyQuery(html)
    lis = doc('div.shop-all-list li')
    print(len(lis))
    # FIX: open the result file once. The original reopened it inside the
    # loop for every <li> and never closed it, leaking file handles.
    with io.open(resultFile, 'a', encoding='utf_16') as fileinfo:
        for li in lis:
            li_doc = PyQuery(li)
            szTitle = li_doc('li div div a h4').text()
            szTitle = szTitle.replace("\r\n", "-").replace(" ", "")
            szStar = li_doc("li div div span").filter('.sml-rank-stars').attr('title')
            szReviewNum = li_doc('li div div a').filter('.review-num').text()
            szReviewNum = szReviewNum.replace("\n", "").replace(" ", "")
            szMeanPrice = li_doc('li div div a').filter('.mean-price').text()
            szMeanPrice = szMeanPrice.replace("\n", "").replace(" ", "")
            szCategory = li_doc('li div div a span').filter('.tag').eq(1).text()
            # NOTE(review): szAddressA uses the exact selector szCategory uses
            # (.tag eq(1)); the BeautifulSoup variant reads a different node
            # here — looks like a copy-paste slip, verify against the live page.
            szAddressA = li_doc('li div div a span').filter('.tag').eq(1).text()
            szAddressB = li_doc('li div div span').filter('.addr').eq(0).text()
            szAddress = (szAddressA + "-" + szAddressB).replace("\r\n", "-").replace(" ", "")
            szTaste = li_doc('li div span span').eq(0).text()
            szEvn = li_doc('li div span span').eq(1).text()
            szService = li_doc('li div span span').eq(2).text()
            fileinfo.write(szTitle + "," + szStar + "," + szReviewNum + "," + szMeanPrice + ","
                           + szCategory + "," + szAddress + "," + szTaste + "," + szEvn + ","
                           + szService + "\n")