python抓取静态页面

九九

8 年前

尝试用 Python 抓取静态页面，似乎比用 C# 更容易实现，现成的类库也相当多。

用基础模块写了一段简单的代码，用来抓取糗事百科热门页面的笑话列表：

#coding: utf-8
# import urllib.request as ur
# import json
import os
import sqlite3
import traceback  # prints the stack trace of a caught exception
import urllib  # Python 3 has no urllib2 any more
import urllib.request  # the submodule must be imported explicitly before use
from sys import argv  # command-line arguments

import requests
from lxml import etree  # XPath-capable element tree


def getHtml(url):
  """Fetch ``url`` and return the raw response body as bytes.

  The request carries a browser-like User-Agent and uses the requested URL
  itself as the Referer.

  The body is returned undecoded on purpose: a file opened in binary mode
  (as saveToFile() does) rejects ``str`` with "a bytes-like object is
  required, not 'str'".  Background on the Python 3 text/bytes split:
  https://docs.python.org/release/3.0.1/whatsnew/3.0.html#text-vs-data-instead-of-unicode-vs-8-bit
  https://stackoverflow.com/questions/29643544/python-a-bytes-like-object-is-required-not-str

  Args:
    url: Address of the page to download.

  Returns:
    The response body as ``bytes``.

  Raises:
    urllib.error.URLError: If the request cannot be completed.
  """
  headers={
    'User-Agent':'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
    'Referer':url
  } # request headers
  req=urllib.request.Request(url,headers=headers)
  # The context manager closes the response (and its socket) even if read() fails.
  with urllib.request.urlopen(req) as res:
    return res.read()

# Create the article table in the local SQLite database (kept for later use).
def createTable():
  """Create the ``article`` table in ``t.db`` unless it already exists.

  The database file is opened relative to the current working directory.
  The call is safe to repeat: ``if not exists`` keeps a second run from
  failing with "table article already exists".

  Raises:
    sqlite3.Error: If the database cannot be opened or the statement fails.
  """
  conn=sqlite3.connect('t.db')
  try:
    conn.execute('''create table if not exists article
  (
    id int primary key not null,
    title text not null,
    author text not null,
    content text not null,
    publictime text
  )
  ''')
    conn.commit()
  finally:
    # Always release the file handle, even when the statement raises.
    conn.close()
def traToTree(html):
  """Parse the HTML document ``html`` with lxml and return the parsed tree."""
  root=etree.HTML(html)
  return root
def getDom(tree,xp):
  """Evaluate the XPath expression ``xp`` against ``tree`` and return the result."""
  matches=tree.xpath(xp)
  return matches
def saveToFile(html):
  """Write the raw page bytes to ``file/index.txt``, replacing any old copy.

  The path is relative to the current working directory; the ``file``
  directory is created when it is missing.

  Args:
    html: Bytes-like page content (the file is opened in binary mode, so
      ``str`` is rejected).
  """
  os.makedirs('file',exist_ok=True)
  # 'wb' truncates an existing file; the with-block closes it even if write() fails.
  with open('file/index.txt','wb') as out:
    out.write(html)

# Script entry: download the hot-jokes page and print each joke, numbered from 1.
try:
  html=getHtml('http://www.qiushibaike.com/hot/')
  # saveToFile(html)  # uncomment to keep a local copy of the page
  tree=traToTree(html)
  # title=getDom(tree,'//*[@id="cb_post_title_url"]')[0].text
  # If an XPath matches nothing, the content is probably loaded via ajax
  # and is therefore absent from the static HTML.
  paragraphs=getDom(tree,'//*[@class="content"]')
  # XPath keeps the extraction short.

  # enumerate() does the numbering; Python has no ``index++`` operator.
  for index,paragraph in enumerate(paragraphs,1):
    # Read the text from the first child element; fall back to the matched
    # element itself so a childless match does not abort the whole listing.
    node=paragraph[0] if len(paragraph) else paragraph
    # ``text`` is None for an element without leading text; str + None would raise.
    print(str(index)+':'+(node.text or ''))
except Exception:
  # Best-effort script: report the failure, but let Ctrl-C / SystemExit through.
  traceback.print_exc()

简单效果输出:
print log

C:\py>python txt.py
1:像我们这个年纪
2:老爸开小卖店的，老鼠很猖獗，于是给他买了只猫，现在好了，老鼠吃饭有伴了。。。
3:有木有和我一样的，没开工资想好了买什么买什么，开了工资什么都不舍的买，然后工资也不知道怎么花的莫名其妙就没了的
..........