1. Crawler Overview

  • Uses of crawlers
    • Data analysis / building datasets
    • Cold-starting social apps with content
    • Monitoring competitors, etc.
  • Types of crawlers
    • General-purpose crawlers
      • Baidu, 360, Google, etc.
      • Drawback: most of the crawled data is useless, and the crawl cannot be targeted precisely at what the user actually needs
    • Focused crawlers [the kind studied here]
      • Crawl only the data you need, driven by your requirements
  • Anti-crawling measures
    • User-Agent: the user agent, abbreviated UA, is a special string header that lets the server identify the client's operating system and version, and so on (a quick check of what urllib sends by default is sketched after this list)
    • Proxy IPs
    • CAPTCHA challenges
    • Dynamically loaded pages
    • Data encryption
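The UA point is easy to verify: by default urllib announces itself with a Python User-Agent string, which is exactly what UA-based anti-crawling checks for. A minimal sketch, assuming the public echo service httpbin.org (not part of these notes) is reachable:

# By default urllib sends a User-Agent such as "Python-urllib/3.x", which servers can filter out
from urllib import request

# httpbin.org/user-agent simply echoes back the User-Agent header it received
response = request.urlopen("https://httpbin.org/user-agent")
print(response.read().decode("UTF-8"))  # e.g. {"user-agent": "Python-urllib/3.11"}

# Sending a browser-like UA instead (the technique covered in section 3 below)
req = request.Request(
    "https://httpbin.org/user-agent",
    headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
)
print(request.urlopen(req).read().decode("UTF-8"))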

2. Using the urllib Library

# Use urllib to fetch the source of the Baidu home page
from urllib import request
# 1. Simulate a browser sending a request to the server — request.urlopen(url)
response = request.urlopen("http://www.baidu.com")
# 2. Get the page source from the response
"""
    read() returns the body as raw bytes; convert the bytes to a string by decoding — decode("encoding")
"""
content = response.read().decode("UTF-8")
# 3. Print the data
print(content)
  • response: one type, six methods
    • read(): reads the binary data as bytes
    • readline(): reads a single line
    • readlines(): reads all lines until the end
    • getcode(): returns the HTTP status code
    • geturl(): returns the URL
    • getheaders(): returns the response headers
from urllib import request

# Simulate a browser sending a request to the server
response = request.urlopen('http://www.baidu.com')
# type of response — HTTPResponse
print(type(response))
# read() reads the whole body, byte by byte
content = response.read()
print(content)
# Note: the body is a stream and can only be consumed once, so after the full read()
# above, the read(10) and readline() calls below return empty bytes in this script.
# read(n) returns at most the specified number of bytes
content = response.read(10)
print(content)
# readline() reads one line; readlines() reads all lines
print(response.readline())
# getcode() returns the status code
print(response.getcode())
# geturl() returns the URL
print(response.geturl())
# getheaders() returns the response headers
print(response.getheaders())
  • Downloading: request.urlretrieve(url, filename)
from urllib import request
"""
    request.urlretrieve(url, filename)
    url: the address of the resource to download
    filename: the name of the file to save it as
"""
# Download a web page
request.urlretrieve("http://www.baidu.com", "./baidu.html")
# Download an image
request.urlretrieve("https://img1.baidu.com/it/u=2463514011,1142503686&fm=253&fmt=auto&app=138&f=JPEG?w=888&h=500", "picture.jpg")
# Download a video
request.urlretrieve("https://vd7.bdstatic.com/mda-ncsdzxnexbbtvdby/sc/cae_h264_delogo/1648375134877137115/mda-ncsdzxnexbbtvdby.mp4?v_from_s=hkapp-haokan-hbf&auth_key=1683274803-0-0-9aa8ec7ad50ebe1729d0a2a251360f47&bcevod_channel=searchbox_feed&pd=1&cd=0&pt=3&logid=3003349433&vid=11168769734631808674&abtest=109159_1-109432_1&klogid=3003349433&sdk_xcdn=1", "video.mp4")

3. Customizing the Request Object

  • About the UA

    • User Agent, abbreviated UA, is a special string header that lets the server identify the client's operating system and version, CPU type, browser and version, browser engine, and so on
  • Syntax

    request = urllib.request.Request(url, data, headers)

  • Example

    # Fetching https://www.baidu.com directly does not return the real page source — anti-crawling
    import urllib.request
    # A browser-like UA to get past the UA check
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0"
    }
    # Customize the request object: request.Request(url, headers=headers)
    # [note: the second positional parameter is data, so headers must be passed as a keyword argument]
    request = urllib.request.Request(url="https://www.baidu.com", headers=headers)

    response = urllib.request.urlopen(request)

    print(response.read().decode("UTF-8"))
    

4. Encoding and Decoding

  • GET requests: urllib.parse.quote()

    # https://www.baidu.com/s?wd=%E5%91%A8%E6%9D%B0%E4%BC%A6 is the same as https://www.baidu.com/s?wd=周杰伦
    from urllib import request, parse
    # Goal: fetch https://www.baidu.com/s?wd=周杰伦

    # Percent-encode (URL-encode) the characters 周杰伦 — urllib.parse.quote("周杰伦")
    url = f"https://www.baidu.com/s?wd={parse.quote('周杰伦')}"

    # Customize the request object to deal with the first anti-crawling measure, the UA check
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0"
    }
    # Customize the request object
    paramRequest = request.Request(url=url, headers=headers)
    # Simulate a browser sending the request to the server
    response = request.urlopen(paramRequest)
    print(response.read().decode("UTF-8"))
    
  • GET requests: urllib.parse.urlencode()

    # Goal: fetch the page source of
    # https://www.baidu.com/s?wd=%E5%91%A8%E6%9D%B0%E4%BC%A6&sex=%E7%94%B7&location=%E4%B8%AD%E5%9B%BD%E5%8F%B0%E6%B9%BE%E7%9C%81

    import urllib.request, urllib.parse
    url = "https://www.baidu.com/s?"

    data = {
        "wd": "周杰伦",
        "sex": "男",
        "location": "中国台湾省"
    }
    # urlencode() percent-encodes each key/value pair and joins them with &
    data = urllib.parse.urlencode(data)  # wd=%E5%91%A8%E6%9D%B0%E4%BC%A6&sex=%E7%94%B7&location=%E4%B8%AD%E5%9B%BD%E5%8F%B0%E6%B9%BE%E7%9C%81
    # Full request URL
    url = url + data  # https://www.baidu.com/s?wd=%E5%91%A8%E6%9D%B0%E4%BC%A6&sex=%E7%94%B7&location=%E4%B8%AD%E5%9B%BD%E5%8F%B0%E6%B9%BE%E7%9C%81
    # UA header to get past the anti-crawling check
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0"
    }
    paramRequest = urllib.request.Request(url=url, headers=headers)
    # Fetch the resource
    response = urllib.request.urlopen(paramRequest)
    print(response.read().decode("UTF-8"))
    
  • POST requests: urllib.parse.urlencode(data).encode("UTF-8")

    import urllib.request, urllib.parse
    url = "https://fanyi.baidu.com/sug"

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0"
    }
    # Request parameters
    data = {
        "kw": "spider"
    }
    # POST parameters must be byte-encoded: urllib.parse.urlencode(data).encode("UTF-8")
    data = urllib.parse.urlencode(data).encode("UTF-8")
    # POST parameters go into the data argument of the customized request object
    paramRequest = urllib.request.Request(url=url, data=data, headers=headers)
    # Simulate a browser sending the request to the server
    response = urllib.request.urlopen(paramRequest)
    # Response data
    content = response.read().decode("UTF-8")
    # Parse str -> dict
    import json
    print(json.loads(content))
    
  • Differences between POST and GET (a compact sketch follows this list)

    • GET parameters must be URL-encoded and are appended to the URL after "?"; after urlencode() there is no need to call encode()
    • POST parameters must be URL-encoded and are passed in the data argument of the request object; after urlencode() they must additionally be byte-encoded with encode()
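A compact sketch of that difference; the URL and parameter names below are made up purely for illustration:

import urllib.parse, urllib.request

params = {"wd": "spider", "page": "1"}

# GET: urlencode() only; the encoded string is appended to the URL after "?"
get_url = "https://example.com/s?" + urllib.parse.urlencode(params)  # .../s?wd=spider&page=1
get_request = urllib.request.Request(url=get_url)

# POST: urlencode() plus encode() to get bytes; the bytes go into the data argument,
# and the presence of data is what makes urllib send a POST instead of a GET
post_data = urllib.parse.urlencode(params).encode("UTF-8")
post_request = urllib.request.Request(url="https://example.com/s", data=post_data)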
  • Example: Baidu detailed translation

import urllib.parse,urllib.request
url = "https://fanyi.baidu.com/v2transapi?from=en&to=zh"

headers = {
    #'Accept': '*/*',
    #'Accept-Encoding': 'gzip, deflate, br',
    #'Accept-Language': 'zh-CN,zh;q=0.9',
    #'Acs-Token': '1683294833496_1683294866698_LkSNNUVfH6cfR5UP++pWDoE0QA1qvlsA3ioNxDki0ReLU0VdXsIgbPKUJe13YXG0Hat88e9WhofgGXxu5vJ3QOKytAZ9+zirnFQsyOCrnbKEAqKDymGlthJmmZTvG61lRJLuCuGSgh8sN7Tp5n+vPEAPb8GC/30pEMncal3vTI+bnUSJ6mQ5/92C/dwzq2wgx6uVAVZD9Dw5TJvOq+Hgq7QtrGIoGGBG/QYOL8+tbVvXTcCRJ8esz16WJef8d0oyFwdHANuZYPHPggSSXsQiUv69b2S+GM127O5xA6lzmUWNxg7b0d8ypaQK7GXl9jHx3GxC0Gx4mVm+89H+l7HZpejF3add2LsGdAnVoYE9ZAdiYFH77/F9VIEJEgj6BByTwgJ8Fka+SEm1B0goV3k9KbNaYn9soMnHAe5PeXUT5veiHWhqMHYp+K443rTWRONT23ofsqyl1x+F+hOgPGqKQ0UFYOI66/rwCsDZiMXAsNFSU2EgdNe38FLh8yJj+I4+',
    #'Connection': 'keep-alive',
    #'Content-Length': '135',
    #'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Cookie': 'BIDUPSID=2DE3E3D663B72B4BC835EDB5B172B2F1; PSTM=1683294818; BAIDUID=2DE3E3D663B72B4BF506C0091DF59930:FG=1; BA_HECTOR=2g0k21040g2k21ah20ah814q1i5a2j21m; BAIDUID_BFESS=2DE3E3D663B72B4BF506C0091DF59930:FG=1; ZFY=iRhz0VKUtwbSH69ydEls11T9mOe:BIi3vnc7rCt1nOnE:C; PSINO=1; H_PS_PSSID=38515_36545_38529_38469_38468_38486_37709_38504_26350_38570_38543; delPer=0; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BCLID=10626756841766805276; BCLID_BFESS=10626756841766805276; BDSFRCVID=r_0OJexroG07VWbfoDb7u0fNA_weG7bTDYrEOwXPsp3LGJLVFe3JEG0Pts1-dEu-S2OOogKKBeOTHg_F_2uxOjjg8UtVJeC6EG0Ptf8g0M5; BDSFRCVID_BFESS=r_0OJexroG07VWbfoDb7u0fNA_weG7bTDYrEOwXPsp3LGJLVFe3JEG0Pts1-dEu-S2OOogKKBeOTHg_F_2uxOjjg8UtVJeC6EG0Ptf8g0M5; H_BDCLCKID_SF=tRAOoC_-tDvDqTrP-trf5DCShUFsbq4jB2Q-XPoO3KtbSx3Pb4nU-6-0DMjw3lRf5mkf3fbgy4op8P3y0bb2DUA1y4vp0tLeWeTxoUJ2-KDVeh5Gqq-KXU4ebPRi3tQ9QgbMMhQ7tt5W8ncFbT7l5hKpbt-q0x-jLTnhVn0MBCK0HPonHj8aj6jQ3D; H_BDCLCKID_SF_BFESS=tRAOoC_-tDvDqTrP-trf5DCShUFsbq4jB2Q-XPoO3KtbSx3Pb4nU-6-0DMjw3lRf5mkf3fbgy4op8P3y0bb2DUA1y4vp0tLeWeTxoUJ2-KDVeh5Gqq-KXU4ebPRi3tQ9QgbMMhQ7tt5W8ncFbT7l5hKpbt-q0x-jLTnhVn0MBCK0HPonHj8aj6jQ3D; APPGUIDE_10_0_2=1; REALTIME_TRANS_SWITCH=1; FANYI_WORD_SWITCH=1; HISTORY_SWITCH=1; SOUND_SPD_SWITCH=1; SOUND_PREFER_SWITCH=1; Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1683294832; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1683294832; ab_sr=1.0.1_MTBkZmQwNzVkMzcyZjY2MTM1MTQyNzM2MzRmYzczNTlmZjY4YzczZmQ3MDVhNmRiNWEyZWNhNjc1MTRkMjFhZWU3ZTk0MTY1NGU0NTFkYzRkNmI1OGI1YTExZGVjNGJjZTkxMjY1YzJkNTI1MzgxOTZhNDg1NzQxZmEyZDZlNWU5OWFmZGUwNDJhY2I2ZGQ5N2MyYjU1NTJkZjI2NmUzYQ==',
    #'Host': 'fanyi.baidu.com',
    #'Origin': 'https://fanyi.baidu.com',
    #'Referer': 'https://fanyi.baidu.com/?aldtype=16047',
    #'sec-ch-ua': '"Chromium";v="112", "Google Chrome";v="112", "Not:A-Brand";v="99"',
    #'sec-ch-ua-mobile': '?0',
    #'sec-ch-ua-platform': '"Windows"',
    #'Sec-Fetch-Dest': 'empty',
    #'Sec-Fetch-Mode': 'cors',
    #'Sec-Fetch-Site': 'same-origin',
    #'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36',
    #'X-Requested-With': 'XMLHttpRequest'
}
# Parameters (copied from the request captured in the browser's dev tools)
data = {
    'from': 'en',
    'to': 'zh',
    'query': 'love',
    'transtype': 'realtime',
    'simple_means_flag': '3',
    'sign': '198772.518981',
    'token': '469aebfdada94cf9299fdf782c731610',
    'domain': 'common'
}
# Encode the parameters
data = urllib.parse.urlencode(data).encode("UTF-8")
# Customize the request object
paramRequest = urllib.request.Request(url, data, headers)
# Simulate sending the request
response = urllib.request.urlopen(paramRequest)
content = response.read().decode("UTF-8")
import json
print(json.loads(content))

5. AJAX GET Requests

  • Douban Movies: download the first page of data
import urllib.request
url = "https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&start=0&limit=20"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0"
}
paramRequest = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(paramRequest)

content = response.read().decode("UTF-8")
with open('./data/douban1.json', 'w', encoding="UTF-8") as f:
    f.write(content)
  • Douban Movies: download the first 10 pages of data
import urllib.parse
import urllib.request
# Customize the request object for a given page
def createRequest(page):
    baseUrl = "https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action="
    data = {
        "start": (page - 1)*20,
        "limit": 20
    }
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0"
    }
    url = baseUrl + urllib.parse.urlencode(data)
    request = urllib.request.Request(url=url, headers=headers)
    return request
# Fetch the data
def getConnection(request):
    response = urllib.request.urlopen(request)
    content = response.read().decode("UTF-8")
    return content
# Save the data to a file
def downLoad(page,content):
    with open(f"./data/douban_{page}.json", "w", encoding="UTF-8") as f:
        f.write(content)

if __name__ == '__main__':
    startPage = int(input("Start page: "))
    endPage = int(input("End page: "))
    for page in range(startPage, endPage + 1):
        # Customize the request object
        request = createRequest(page)
        # Fetch the data
        content = getConnection(request)
        # Save it to a file
        downLoad(page, content)

6. AJAX POST Requests

  • Example: KFC official site (store list)
import urllib.parse,urllib.request

url = "http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0"
}
def createRequest(page):
    data = {
        'cname': '北京',
        'pid': '',
        'pageIndex': page,
        'pageSize': '10',
    }
    data = urllib.parse.urlencode(data).encode("UTF-8")
    request = urllib.request.Request(url,data,headers)
    return request
# Fetch the content
def getContent(request):
    response = urllib.request.urlopen(request)
    content = response.read().decode("UTF-8")
    return content
# Save the data to a file
def downLoad(page,content):
    with open(f'kfc{page}.json',"w",encoding="UTF-8") as f:
        f.write(content)
if __name__ == '__main__':
    startPage = int(input("Start page: "))
    endPage = int(input("End page: "))
    for page in range(startPage, endPage + 1):
        # Customize the request object
        request = createRequest(page)
        # Fetch the content
        content = getContent(request)
        # Save the data
        downLoad(page, content)

7. URLError and HTTPError

  • HTTPError is a subclass of URLError
import urllib.request,urllib.error

url = "https://blog.csdn.net/youyouxiong/article/details/125141038" #HTTPError

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0"
}

try:
    paramRequest = urllib.request.Request(url=url, headers=headers)

    response = urllib.request.urlopen(paramRequest)

    print(response.read().decode("UTF-8"))
except urllib.error.HTTPError:
    print("Something went wrong, the system is being upgraded...")
except urllib.error.URLError:
    print("The URL could not be reached...")

8. Cookie Login

  • Use case: when scraping, you need to get past the login step before you can reach a particular page
    • The profile page itself is UTF-8, yet decoding still raised an encoding error: without a valid cookie the request never reaches the profile page and is redirected to the login page instead, and the login page is not UTF-8, so decode("UTF-8") fails
  • Example: Weibo login
# The request fails when the headers carry too little information
import urllib.request

url = "https://weibo.com/set/index"

headers = {
    #':authority': 'rm.api.weibo.com',
    #':method': 'GET',
    #':path': '/2/remind/push_count.json?trim_null=1&with_dm_group=1&with_reminding=1&with_settings=1&exclude_attitude=1&with_chat_group_notice=1&source=339644097&with_chat_group=1&with_dm_unread=1&callback=__jp0',
    #':scheme': 'https',
    'accept': '*/*',
    #'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    # The cookie carries your login state; with a cookie captured after logging in, you can reach any page that requires login
    'cookie': '_s_tentry=weibo.com; appkey=; Apache=2573814727969.865.1683334145542; SINAGLOBAL=2573814727969.865.1683334145542; ULV=1683334145546:1:1:1:2573814727969.865.1683334145542:; login_sid_t=5e64f8037a70e7b19f44fc4957ef2b58; cross_origin_proto=SSL; SUB=_2A25JUdzhDeThGeFO6lAS8inKzDyIHXVqJ0kprDV8PUNbmtANLVTwkW9NQZ4igTsqKDbmYHLfOeLx7ZbeK03UGOX8; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WFAmWP.PnUSj2a-kPEpLx3x5JpX5KzhUgL.FoM7eKz0eoMcS052dJLoIp7LxKML1KBLBKnLxKqL1hnLBoMNeh2Ee0zNSoM7; ALF=1714872368; SSOLoginState=1683336369',
    # referer tells the server which page the request came from; servers check it mainly for image hotlink protection
    'referer': 'https://weibo.com/',
    'sec-ch-ua': '"Chromium";v="112", "Google Chrome";v="112", "Not:A-Brand";v="99"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'script',
    'sec-fetch-mode': 'no-cors',
    'sec-fetch-site': 'same-site',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36',
}
paramRequest = urllib.request.Request(url=url,headers=headers)

response = urllib.request.urlopen(paramRequest)

content = response.read().decode("UTF-8")
print(content)

9. Handler

  • Why handlers
    • urllib.request.urlopen(url): cannot customize request headers
    • urllib.request.Request(url, data, headers): can customize request headers
    • Handler: for more advanced customization such as dynamic cookies and proxies (a cookie-handler sketch follows the basic example below)
import urllib.request
url = "https://www.baidu.com"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0"
}

paramRequest = urllib.request.Request(url=url,headers=headers)

# 1. Create a handler object: urllib.request.HTTPHandler()
handler = urllib.request.HTTPHandler()
# 2. Create an opener object: urllib.request.build_opener(handler)
opener = urllib.request.build_opener(handler)
# 3. Call the open method: opener.open(request)
response = opener.open(paramRequest)
print(response.read().decode("UTF-8"))
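As an illustration of the "dynamic cookie" case mentioned above, a minimal sketch using the standard-library http.cookiejar; the flow below is an assumption for illustration, not part of the original notes:

# A handler that keeps cookies across requests: HTTPCookieProcessor stores any
# Set-Cookie headers in the CookieJar and sends them back automatically on later
# requests made through the same opener.
import http.cookiejar
import urllib.request

cookie_jar = http.cookiejar.CookieJar()
cookie_handler = urllib.request.HTTPCookieProcessor(cookie_jar)
opener = urllib.request.build_opener(cookie_handler)

# Every request made through this opener shares the same cookie jar
response = opener.open("http://www.baidu.com")
print([cookie.name for cookie in cookie_jar])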

10. Proxy Servers

  • Common uses of proxies
    • Get around IP-based access restrictions, e.g. to reach sites abroad
    • Access internal resources of an organization
    • Improve access speed (via caching)
    • Hide your real IP
  • Configuring a proxy in code
import urllib.request

url = "https://www.baidu.com/s?wd=ip"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0"
}
paramRequest = urllib.request.Request(url=url,headers=headers)
proxies = {
    # proxy address; a real proxy is normally given as "host:port" (this sample address is only a placeholder)
    "http": "221.176.140.214"
}
handler = urllib.request.ProxyHandler(proxies=proxies)

opener = urllib.request.build_opener(handler)

response = opener.open(paramRequest)
print(response.read().decode("UTF-8"))
  • Proxy pool
import urllib.request,urllib.parse
import random


proxiesPool = [
    {"http":"117.114.149.66:55443"},
    {"http":"210.5.10.87:53281"}
]
proxies = random.choice(proxiesPool)
url = f"https://www.baidu.com/s?wd=ip"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0"
}
request = urllib.request.Request(url=url,headers=headers)
# Create the handler with the chosen proxy
handler = urllib.request.ProxyHandler(proxies=proxies)
# Build the opener object
opener = urllib.request.build_opener(handler)
# Send the request
response = opener.open(request)
content = response.read().decode("UTF-8")
with open('ip.html',"w",encoding="UTF-8") as f:
    f.write(content)
