这篇“Python3中如何解析html”文章的知识点大部分人都不太理解,所以小编给大家总结了以下内容,内容详细,步骤清晰,具有一定的借鉴价值,希望大家阅读完这篇文章能有所收获,下面我们一起来看看这篇“Python3中如何解析html”文章吧。
辅助函数,主要用于获取html并输入解析后的结束
#把传递解析函数,便于下面的修改 def get_html(url, paraser=bs4_paraser): headers = { 'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate, sdch', 'Accept-Language': 'zh-CN,zh;q=0.8', 'Host': 'www.360kan.com', 'Proxy-Connection': 'keep-alive', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36' } request = urllib2.Request(url, headers=headers) response = urllib2.urlopen(request) response.encoding = 'utf-8' if response.code == 200: data = StringIO.StringIO(response.read()) gzipper = gzip.GzipFile(fileobj=data) data = gzipper.read() value = paraser(data) # open('E:/h6/haPkY0osd0r5UB.html').read() return value else: pass value = get_html('http://www.360kan.com/m/haPkY0osd0r5UB.html', paraser=lxml_parser) for row in value: print row
1,lxml.html的方式进行解析,
def lxml_parser(page): data = [] doc = etree.HTML(page) all_div = doc.xpath('//div[@class="yingping-list-wrap"]') for row in all_div: # 获取每一个影评,即影评的item all_div_item = row.xpath('.//div[@class="item"]') # find_all('div', attrs={'class': 'item'}) for r in all_div_item: value = {} # 获取影评的标题部分 title = r.xpath('.//div[@class="g-clear title-wrap"][1]') value['title'] = title[0].xpath('./a/text()')[0] value['title_href'] = title[0].xpath('./a/@href')[0] score_text = title[0].xpath('./div/span/span/@style')[0] score_text = re.search(r'\d+', score_text).group() value['score'] = int(score_text) / 20 # 时间 value['time'] = title[0].xpath('./div/span[@class="time"]/text()')[0] # 多少人喜欢 value['people'] = int( re.search(r'\d+', title[0].xpath('./div[@class="num"]/span/text()')[0]).group()) data.append(value) return data
2,使用BeautifulSoup,不多说了,大家网上找资料看看
def bs4_paraser(html): all_value = [] value = {} soup = BeautifulSoup(html, 'html.parser') # 获取影评的部分 all_div = soup.find_all('div', attrs={'class': 'yingping-list-wrap'}, limit=1) for row in all_div: # 获取每一个影评,即影评的item all_div_item = row.find_all('div', attrs={'class': 'item'}) for r in all_div_item: # 获取影评的标题部分 title = r.find_all('div', attrs={'class': 'g-clear title-wrap'}, limit=1) if title is not None and len(title) > 0: value['title'] = title[0].a.string value['title_href'] = title[0].a['href'] score_text = title[0].div.span.span['style'] score_text = re.search(r'\d+', score_text).group() value['score'] = int(score_text) / 20 # 时间 value['time'] = title[0].div.find_all('span', attrs={'class': 'time'})[0].string # 多少人喜欢 value['people'] = int( re.search(r'\d+', title[0].find_all('div', attrs={'class': 'num'})[0].span.string).group()) # print r all_value.append(value) value = {} return all_value
3,使用SGMLParser,主要是通过start、end tag的方式进行了,解析工程比较明朗,但是有点麻烦,而且该案例的场景不太适合该方法,(哈哈)
class CommentParaser(SGMLParser): def __init__(self): SGMLParser.__init__(self) self.__start_div_yingping = False self.__start_div_item = False self.__start_div_gclear = False self.__start_div_ratingwrap = False self.__start_div_num = False # a self.__start_a = False # span 3中状态 self.__span_state = 0 # 数据 self.__value = {} self.data = [] def start_div(self, attrs): for k, v in attrs: if k == 'class' and v == 'yingping-list-wrap': self.__start_div_yingping = True elif k == 'class' and v == 'item': self.__start_div_item = True elif k == 'class' and v == 'g-clear title-wrap': self.__start_div_gclear = True elif k == 'class' and v == 'rating-wrap g-clear': self.__start_div_ratingwrap = True elif k == 'class' and v == 'num': self.__start_div_num = True def end_div(self): if self.__start_div_yingping: if self.__start_div_item: if self.__start_div_gclear: if self.__start_div_num or self.__start_div_ratingwrap: if self.__start_div_num: self.__start_div_num = False if self.__start_div_ratingwrap: self.__start_div_ratingwrap = False else: self.__start_div_gclear = False else: self.data.append(self.__value) self.__value = {} self.__start_div_item = False else: self.__start_div_yingping = False def start_a(self, attrs): if self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear: self.__start_a = True for k, v in attrs: if k == 'href': self.__value['href'] = v def end_a(self): if self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear and self.__start_a: self.__start_a = False def start_span(self, attrs): if self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear: if self.__start_div_ratingwrap: if self.__span_state != 1: for k, v in attrs: if k == 'class' and v == 'rating': self.__span_state = 1 elif k == 'class' and v == 'time': self.__span_state = 2 else: for k, v in attrs: if k == 'style': score_text = re.search(r'\d+', v).group() self.__value['score'] = int(score_text) / 20 self.__span_state = 3 elif self.__start_div_num: self.__span_state = 4 def end_span(self): self.__span_state = 0 def handle_data(self, data): if self.__start_a: self.__value['title'] = data elif self.__span_state == 2: self.__value['time'] = data elif self.__span_state == 4: score_text = re.search(r'\d+', data).group() self.__value['people'] = int(score_text) pass def sgl_parser(html): parser = CommentParaser() parser.feed(html) return parser.data
4,HTMLParaer,与3原理相识,就是调用的方法不太一样,基本上可以公用,
class CommentHTMLParser(HTMLParser.HTMLParser): def __init__(self): HTMLParser.HTMLParser.__init__(self) self.__start_div_yingping = False self.__start_div_item = False self.__start_div_gclear = False self.__start_div_ratingwrap = False self.__start_div_num = False # a self.__start_a = False # span 3中状态 self.__span_state = 0 # 数据 self.__value = {} self.data = [] def handle_starttag(self, tag, attrs): if tag == 'div': for k, v in attrs: if k == 'class' and v == 'yingping-list-wrap': self.__start_div_yingping = True elif k == 'class' and v == 'item': self.__start_div_item = True elif k == 'class' and v == 'g-clear title-wrap': self.__start_div_gclear = True elif k == 'class' and v == 'rating-wrap g-clear': self.__start_div_ratingwrap = True elif k == 'class' and v == 'num': self.__start_div_num = True elif tag == 'a': if self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear: self.__start_a = True for k, v in attrs: if k == 'href': self.__value['href'] = v elif tag == 'span': if self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear: if self.__start_div_ratingwrap: if self.__span_state != 1: for k, v in attrs: if k == 'class' and v == 'rating': self.__span_state = 1 elif k == 'class' and v == 'time': self.__span_state = 2 else: for k, v in attrs: if k == 'style': score_text = re.search(r'\d+', v).group() self.__value['score'] = int(score_text) / 20 self.__span_state = 3 elif self.__start_div_num: self.__span_state = 4 def handle_endtag(self, tag): if tag == 'div': if self.__start_div_yingping: if self.__start_div_item: if self.__start_div_gclear: if self.__start_div_num or self.__start_div_ratingwrap: if self.__start_div_num: self.__start_div_num = False if self.__start_div_ratingwrap: self.__start_div_ratingwrap = False else: self.__start_div_gclear = False else: self.data.append(self.__value) self.__value = {} self.__start_div_item = False else: self.__start_div_yingping = False elif tag == 'a': if self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear and self.__start_a: self.__start_a = False elif tag == 'span': self.__span_state = 0 def handle_data(self, data): if self.__start_a: self.__value['title'] = data elif self.__span_state == 2: self.__value['time'] = data elif self.__span_state == 4: score_text = re.search(r'\d+', data).group() self.__value['people'] = int(score_text) pass def html_parser(html): parser = CommentHTMLParser() parser.feed(html) return parser.data
以上就是关于“Python3中如何解析html”这篇文章的内容,相信大家都有了一定的了解,希望小编分享的内容对大家有帮助,若想了解更多相关的知识内容,请关注亿速云行业资讯频道。
免责声明:本站发布的内容(图片、视频和文字)以原创、转载和分享为主,文章观点不代表本网站立场,如果涉及侵权请联系站长邮箱:is@yisu.com进行举报,并提供相关证据,一经查实,将立刻删除涉嫌侵权内容。