BeautifulSoup download: http://www.crummy.com/software/BeautifulSoup/
# -*- coding: utf-8 -*-
"""Scrape joke text from jokeji.cn, following the "next page" link.

Starts at ``startUrl``, collects the text of every <p> inside the
element with id ``text110`` on each page, follows the pager link for up
to ``searchCount`` pages, then prints everything collected.

Ported from Python 2 (urllib2 / print statement) to Python 3.
"""
import re
import urllib.request

from bs4 import BeautifulSoup

homeUrl = 'http://www.jokeji.cn/'
startUrl = 'http://www.jokeji.cn/JokeHtml/bxnn/2014113023321188.htm'  # start page
# Captures the relative path of the "next page" link inside the pager div.
nextPageUrlReg = r'(<div class=zw_page1>[\s\S]*<a href="../../)([\w\d\/\.]+)("[\s\S]*class=zw_page2>)'
page = 1          # number of pages fetched so far
searchCount = 10  # maximum number of pages to crawl
lists = []        # accumulated joke texts (module-level accumulator)


def main(url):
    """Crawl starting at *url* and print every collected joke."""
    searchByUrl(url)
    for text in lists:
        print(text)


def searchByUrl(url):
    """Fetch *url* and subsequent pages, collecting items from each.

    BUG FIX: the original version incremented ``page`` only *after* the
    recursive call returned, so during the descent the counter stayed at
    1 and the ``searchCount`` limit never applied — the crawler followed
    every "next page" link unconditionally.  Rewritten as a loop that
    bumps the counter *before* following the next link, so at most
    ``searchCount`` pages are fetched.
    """
    global page
    while url is not None:
        # Close the connection deterministically (original leaked it).
        with urllib.request.urlopen(url) as response:
            # The site serves GB-encoded pages (original passed
            # from_encoding="gb18030" to BeautifulSoup); decode once at
            # the I/O boundary so the regex below works on str.
            html = response.read().decode('gb18030', errors='replace')
        searchItems(html)
        nextPage = hasNextPage(html)
        if nextPage is None or page >= searchCount:
            break
        page += 1
        url = nextPage


def searchItems(html):
    """Append the text of every <p> under the id='text110' element.

    Returns the module-level ``lists`` accumulator (kept for
    backward compatibility with the original).
    """
    soup = BeautifulSoup(html, 'html.parser')
    content = soup.find(id='text110')
    if content is not None:
        for p in content.findAll('p'):
            # get_text() strips the HTML tags, leaving plain text.
            lists.append(p.get_text())
    return lists


def hasNextPage(html):
    """Return the absolute URL of the next page, or None if there is none.

    Runs the pattern once (the original ran re.search and then
    re.findall on the same input); group 2 is the relative path.
    """
    match = re.search(nextPageUrlReg, html)
    if match is None:
        return None
    return homeUrl + match.group(2)


if __name__ == '__main__':
    main(startUrl)