Python3 中怎么解析html,针对这个问题,这篇文章详细介绍了相对应的分析和解答,希望可以帮助更多想解决这个问题的小伙伴找到更简单易行的方法。
辅助函数,主要用于获取html并输出解析后的结果
# The parse function is passed in as an argument so that it can be swapped
# easily in the examples below.
def get_html(url, paraser=bs4_paraser):
    """Download *url* and return the result of running *paraser* on its body.

    Args:
        url: Address of the page to fetch.
        paraser: Callable that receives the (decompressed) response body and
            returns the parsed value. Defaults to ``bs4_paraser``.

    Returns:
        Whatever ``paraser`` returns, or ``None`` when the response status
        code is not 200.
    """
    headers = {
        'Accept': '*/*',
        # Only advertise gzip: it is the only content encoding decoded below.
        'Accept-Encoding': 'gzip',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Host': 'www.360kan.com',
        'Proxy-Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
    }
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    try:
        if response.code != 200:
            return None
        body = response.read()
        content_encoding = response.info().get('Content-Encoding', '')
    finally:
        # Release the connection whether or not reading succeeded.
        response.close()
    # The server may ignore Accept-Encoding and answer uncompressed, so only
    # gunzip when the response says it is gzip-encoded.
    if 'gzip' in content_encoding.lower():
        body = gzip.GzipFile(fileobj=StringIO.StringIO(body)).read()
    return paraser(body)
# Demo: fetch one movie page and print every review record that was parsed.
value = get_html('http://www.360kan.com/m/haPkY0osd0r5UB.html', paraser=lxml_parser)
# get_html returns None for a non-200 response, so fall back to an empty list
# instead of failing on iteration.
for row in value or []:
    print(row)
1,lxml.html的方式进行解析,
The lxml XML toolkit is a Pythonic binding for the C libraries libxml2 and libxslt. It is unique in that it combines the speed and XML feature completeness of these libraries with the simplicity of a native Python API, mostly compatible but superior to the well-known ElementTree API. The latest release works with all CPython versions from 2.6 to 3.5. See the introduction for more information about background and goals of the lxml project. Some common questions are answered in the FAQ. [官网](http://lxml.de/)
def lxml_parser(page):
    """Extract review records from *page* using lxml XPath queries.

    Args:
        page: HTML source handed to ``etree.HTML``.

    Returns:
        A list with one dict per ``div.item`` found below a
        ``div.yingping-list-wrap``. Each dict has the keys ``title``,
        ``title_href``, ``score``, ``time`` and ``people``.
    """
    reviews = []
    root = etree.HTML(page)
    for wrap in root.xpath('//div[@class="yingping-list-wrap"]'):
        # Every "item" div is a single review.
        for item in wrap.xpath('.//div[@class="item"]'):
            # Title block of the review; the other fields are read from it.
            head = item.xpath('.//div[@class="g-clear title-wrap"][1]')[0]
            title_text = head.xpath('./a/text()')[0]
            title_link = head.xpath('./a/@href')[0]
            # The score is the first number in the inner span's style
            # attribute, divided by 20.
            style = head.xpath('./div/span/span/@style')[0]
            score = int(re.search(r'\d+', style).group()) / 20
            # Time of the review.
            posted = head.xpath('./div/span[@class="time"]/text()')[0]
            # Number of people who liked the review.
            likes_text = head.xpath('./div[@class="num"]/span/text()')[0]
            likes = int(re.search(r'\d+', likes_text).group())
            reviews.append({
                'title': title_text,
                'title_href': title_link,
                'score': score,
                'time': posted,
                'people': likes,
            })
    return reviews
2,使用BeautifulSoup,不多说了,大家网上找资料看看
def bs4_paraser(html):
    """Extract review records from *html* using BeautifulSoup.

    Args:
        html: HTML source handed to ``BeautifulSoup`` with the
            ``'html.parser'`` backend.

    Returns:
        A list with one dict per ``div.item`` inside the first
        ``div.yingping-list-wrap``. A dict carries the keys ``title``,
        ``title_href``, ``score``, ``time`` and ``people`` when the item has
        a title block, and is empty otherwise.
    """
    reviews = []
    soup = BeautifulSoup(html, 'html.parser')
    # Review section of the page (only the first wrapper is looked at).
    wrappers = soup.find_all('div', attrs={'class': 'yingping-list-wrap'}, limit=1)
    for wrap in wrappers:
        # Every "item" div is a single review.
        for item in wrap.find_all('div', attrs={'class': 'item'}):
            record = {}
            # Title block of the review.
            heads = item.find_all('div', attrs={'class': 'g-clear title-wrap'}, limit=1)
            if heads:
                head = heads[0]
                record['title'] = head.a.string
                record['title_href'] = head.a['href']
                # The score is the first number in the inner span's style
                # attribute, divided by 20.
                style = head.div.span.span['style']
                record['score'] = int(re.search(r'\d+', style).group()) / 20
                # Time of the review.
                time_spans = head.div.find_all('span', attrs={'class': 'time'})
                record['time'] = time_spans[0].string
                # Number of people who liked the review.
                num_divs = head.find_all('div', attrs={'class': 'num'})
                likes_text = num_divs[0].span.string
                record['people'] = int(re.search(r'\d+', likes_text).group())
            reviews.append(record)
    return reviews
3,使用SGMLParser,主要是通过start、end tag的方式进行解析,解析过程比较明朗,但是有点麻烦,而且该案例的场景不太适合该方法(哈哈)。注意:SGMLParser所在的sgmllib模块只存在于Python 2,Python 3中已被移除。
class CommentParaser(SGMLParser):
    """SGMLParser subclass that collects review records while parsing.

    Boolean flags remember which of the review-related ``div`` elements are
    currently open. The fields of the review being read are gathered in a
    private dict, which is appended to ``self.data`` when the closing tag of
    a ``div`` is seen while the item flag is set and the title flag is not.
    """

    def __init__(self):
        SGMLParser.__init__(self)
        # Flags set when a <div> with the given class value is opened.
        self.__start_div_yingping = False    # class="yingping-list-wrap"
        self.__start_div_item = False        # class="item"
        self.__start_div_gclear = False      # class="g-clear title-wrap"
        self.__start_div_ratingwrap = False  # class="rating-wrap g-clear"
        self.__start_div_num = False         # class="num"
        # True between the start and end tag of an <a> in the title block.
        self.__start_a = False
        # State of the current <span>:
        #   0 - none (initial value, and after every </span>)
        #   1 - a span with class "rating" was opened
        #   2 - a span with class "time" was opened
        #   3 - the score was read from a style attribute
        #   4 - a span was opened inside the "num" div
        self.__span_state = 0
        # Record being filled, and the list of finished records.
        self.__value = {}
        self.data = []

    def __inside_title(self):
        """Return a true value while the list, item and title flags are all set."""
        return (self.__start_div_yingping
                and self.__start_div_item
                and self.__start_div_gclear)

    def start_div(self, attrs):
        """Set the flag that matches the class value of an opening <div>."""
        for name, val in attrs:
            if name != 'class':
                continue
            if val == 'yingping-list-wrap':
                self.__start_div_yingping = True
            elif val == 'item':
                self.__start_div_item = True
            elif val == 'g-clear title-wrap':
                self.__start_div_gclear = True
            elif val == 'rating-wrap g-clear':
                self.__start_div_ratingwrap = True
            elif val == 'num':
                self.__start_div_num = True

    def end_div(self):
        """Clear the innermost set flag; store the record when an item ends."""
        if not self.__start_div_yingping:
            return
        if not self.__start_div_item:
            self.__start_div_yingping = False
            return
        if not self.__start_div_gclear:
            # Item flag set but title flag cleared: the record is complete.
            self.data.append(self.__value)
            self.__value = {}
            self.__start_div_item = False
            return
        if self.__start_div_num or self.__start_div_ratingwrap:
            self.__start_div_num = False
            self.__start_div_ratingwrap = False
        else:
            self.__start_div_gclear = False

    def start_a(self, attrs):
        """Record the href of an <a> opened inside the title block."""
        if not self.__inside_title():
            return
        self.__start_a = True
        for name, val in attrs:
            if name == 'href':
                self.__value['href'] = val

    def end_a(self):
        """Leave the anchor state entered by start_a."""
        if self.__inside_title() and self.__start_a:
            self.__start_a = False

    def start_span(self, attrs):
        """Update the span state and read the score from a style attribute."""
        if not self.__inside_title():
            return
        if self.__start_div_ratingwrap:
            if self.__span_state == 1:
                # Span opened while in the "rating" state: the score is the
                # first number in its style attribute, divided by 20.
                for name, val in attrs:
                    if name == 'style':
                        digits = re.search(r'\d+', val).group()
                        self.__value['score'] = int(digits) / 20
                        self.__span_state = 3
            else:
                for name, val in attrs:
                    if name != 'class':
                        continue
                    if val == 'rating':
                        self.__span_state = 1
                    elif val == 'time':
                        self.__span_state = 2
        elif self.__start_div_num:
            self.__span_state = 4

    def end_span(self):
        """Reset the span state on every closing </span>."""
        self.__span_state = 0

    def handle_data(self, data):
        """Store text as title, time or like count depending on the state."""
        if self.__start_a:
            self.__value['title'] = data
            return
        state = self.__span_state
        if state == 2:
            self.__value['time'] = data
        elif state == 4:
            digits = re.search(r'\d+', data).group()
            self.__value['people'] = int(digits)
def sgl_parser(html):
    """Feed *html* to a new CommentParaser and return the records it collected."""
    comment_parser = CommentParaser()
    comment_parser.feed(html)
    return comment_parser.data
4,HTMLParser,与3原理相似,就是调用的方法不太一样,基本上可以共用。
class CommentHTMLParser(HTMLParser.HTMLParser):
    """HTMLParser subclass that collects review records while parsing.

    Boolean flags remember which of the review-related ``div`` elements are
    currently open. The fields of the review being read are gathered in a
    private dict, which is appended to ``self.data`` when a closing ``div``
    tag is seen while the item flag is set and the title flag is not.
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        # Flags set when a <div> with the given class value is opened.
        self.__start_div_yingping = False    # class="yingping-list-wrap"
        self.__start_div_item = False        # class="item"
        self.__start_div_gclear = False      # class="g-clear title-wrap"
        self.__start_div_ratingwrap = False  # class="rating-wrap g-clear"
        self.__start_div_num = False         # class="num"
        # True between the start and end tag of an <a> in the title block.
        self.__start_a = False
        # State of the current <span>:
        #   0 - none (initial value, and after every </span>)
        #   1 - a span with class "rating" was opened
        #   2 - a span with class "time" was opened
        #   3 - the score was read from a style attribute
        #   4 - a span was opened inside the "num" div
        self.__span_state = 0
        # Record being filled, and the list of finished records.
        self.__value = {}
        self.data = []

    def __inside_title(self):
        """Return a true value while the list, item and title flags are all set."""
        return (self.__start_div_yingping
                and self.__start_div_item
                and self.__start_div_gclear)

    def __open_div(self, attrs):
        """Set the flag that matches the class value of an opening <div>."""
        for name, val in attrs:
            if name != 'class':
                continue
            if val == 'yingping-list-wrap':
                self.__start_div_yingping = True
            elif val == 'item':
                self.__start_div_item = True
            elif val == 'g-clear title-wrap':
                self.__start_div_gclear = True
            elif val == 'rating-wrap g-clear':
                self.__start_div_ratingwrap = True
            elif val == 'num':
                self.__start_div_num = True

    def __open_anchor(self, attrs):
        """Record the href of an <a> opened inside the title block."""
        if not self.__inside_title():
            return
        self.__start_a = True
        for name, val in attrs:
            if name == 'href':
                self.__value['href'] = val

    def __open_span(self, attrs):
        """Update the span state and read the score from a style attribute."""
        if not self.__inside_title():
            return
        if self.__start_div_ratingwrap:
            if self.__span_state == 1:
                # Span opened while in the "rating" state: the score is the
                # first number in its style attribute, divided by 20.
                for name, val in attrs:
                    if name == 'style':
                        digits = re.search(r'\d+', val).group()
                        self.__value['score'] = int(digits) / 20
                        self.__span_state = 3
            else:
                for name, val in attrs:
                    if name != 'class':
                        continue
                    if val == 'rating':
                        self.__span_state = 1
                    elif val == 'time':
                        self.__span_state = 2
        elif self.__start_div_num:
            self.__span_state = 4

    def __close_div(self):
        """Clear the innermost set flag; store the record when an item ends."""
        if not self.__start_div_yingping:
            return
        if not self.__start_div_item:
            self.__start_div_yingping = False
            return
        if not self.__start_div_gclear:
            # Item flag set but title flag cleared: the record is complete.
            self.data.append(self.__value)
            self.__value = {}
            self.__start_div_item = False
            return
        if self.__start_div_num or self.__start_div_ratingwrap:
            self.__start_div_num = False
            self.__start_div_ratingwrap = False
        else:
            self.__start_div_gclear = False

    def handle_starttag(self, tag, attrs):
        """Dispatch opening div, a and span tags to their handlers."""
        if tag == 'div':
            self.__open_div(attrs)
        elif tag == 'a':
            self.__open_anchor(attrs)
        elif tag == 'span':
            self.__open_span(attrs)

    def handle_endtag(self, tag):
        """Undo the state changes made when the matching tag was opened."""
        if tag == 'div':
            self.__close_div()
        elif tag == 'a':
            if self.__inside_title() and self.__start_a:
                self.__start_a = False
        elif tag == 'span':
            self.__span_state = 0

    def handle_data(self, data):
        """Store text as title, time or like count depending on the state."""
        if self.__start_a:
            self.__value['title'] = data
            return
        state = self.__span_state
        if state == 2:
            self.__value['time'] = data
        elif state == 4:
            digits = re.search(r'\d+', data).group()
            self.__value['people'] = int(digits)
def html_parser(html):
    """Feed *html* to a new CommentHTMLParser and return the records it collected."""
    comment_parser = CommentHTMLParser()
    comment_parser.feed(html)
    return comment_parser.data
关于Python3 中怎么解析html问题的解答就分享到这里了,希望以上内容可以对大家有一定的帮助,如果你还有很多疑惑没有解开,可以关注亿速云行业资讯频道了解更多相关知识。
亿速云「云服务器」,即开即用、新一代英特尔至强铂金CPU、三副本存储NVMe SSD云盘,价格低至29元/月。点击查看>>
免责声明:本站发布的内容(图片、视频和文字)以原创、转载和分享为主,文章观点不代表本网站立场,如果涉及侵权请联系站长邮箱:is@yisu.com进行举报,并提供相关证据,一经查实,将立刻删除涉嫌侵权内容。