# Fetch the source of the Baidu homepage with urllib
from urllib import request
# 1. Simulate a browser sending a request to the server: request.urlopen(url)
response = request.urlopen("http://www.baidu.com")
# 2. Extract the page source from the response
"""
read() returns the body as raw bytes; decoding turns the bytes into a string: decode("encoding")
"""
content = response.read().decode("UTF-8")
# 3. Print the data
print(content)
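Hard-coding UTF-8 works for Baidu, but servers usually declare their charset in the Content-Type header. A minimal sketch that reads the declared charset instead of guessing, falling back to UTF-8 when none is given:

```python
from urllib import request

response = request.urlopen("http://www.baidu.com")
# get_content_charset() parses the charset out of the Content-Type header
charset = response.headers.get_content_charset() or "UTF-8"
content = response.read().decode(charset)
print(content[:200])
```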
Method | Purpose |
---|---|
read() | Read the body as bytes |
readline() | Read a single line |
readlines() | Read all remaining lines |
getcode() | Get the HTTP status code |
geturl() | Get the requested URL |
getheaders() | Get the response headers |
from urllib import request
# Simulate a browser sending a request to the server
response = request.urlopen('http://www.baidu.com')
# response is of type HTTPResponse
print(type(response))
# read() returns the whole body as bytes; note that it consumes the stream,
# so any later read on the same response only sees what is left
content = response.read()
print(content)
# read(n) returns at most n bytes (empty here, since the body was already consumed above)
content = response.read(10)
print(content)
# readline() reads one line; readlines() reads all remaining lines
print(response.readline())
# getcode() returns the HTTP status code
print(response.getcode())
# geturl() returns the URL that was fetched
print(response.geturl())
# getheaders() returns the response headers
print(response.getheaders())
from urllib import request
"""
request.urlretrieve(url, filename)
url: the address to download from
filename: the local file name to save to
"""
# Download a web page
request.urlretrieve("http://www.baidu.com", "./baidu.html")
# Download an image
request.urlretrieve("https://img1.baidu.com/it/u=2463514011,1142503686&fm=253&fmt=auto&app=138&f=JPEG?w=888&h=500", "picture.jpg")
# Download a video
request.urlretrieve("https://vd7.bdstatic.com/mda-ncsdzxnexbbtvdby/sc/cae_h264_delogo/1648375134877137115/mda-ncsdzxnexbbtvdby.mp4?v_from_s=hkapp-haokan-hbf&auth_key=1683274803-0-0-9aa8ec7ad50ebe1729d0a2a251360f47&bcevod_channel=searchbox_feed&pd=1&cd=0&pt=3&logid=3003349433&vid=11168769734631808674&abtest=109159_1-109432_1&klogid=3003349433&sdk_xcdn=1", "video.mp4")
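urlretrieve() also accepts an optional reporthook callback that is invoked once per downloaded block, which is handy for progress display. A small sketch reusing the page download above:

```python
from urllib import request

def progress(block_num, block_size, total_size):
    # total_size is -1 when the server sends no Content-Length header
    if total_size > 0:
        percent = min(block_num * block_size * 100 / total_size, 100)
        print(f"\rdownloaded {percent:.1f}%", end="")

request.urlretrieve("http://www.baidu.com", "./baidu.html", reporthook=progress)
print()
```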
UA (User-Agent) overview
Syntax
request = urllib.request.Request(url, data, headers)
Example
# A bare request to https://www.baidu.com returns no real page content: anti-scraping blocks requests without a UA
import urllib.request
# UA header to get past the anti-scraping check
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0"
}
# Build the request object: request.Request(url, headers=...). Note that the
# second positional parameter is data, so headers must be passed by keyword
request = urllib.request.Request(url="https://www.baidu.com", headers=headers)
response = urllib.request.urlopen(request)
print(response.read().decode("UTF-8"))
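Headers can also be attached after the Request is constructed, via add_header(); a minimal sketch with the same UA:

```python
import urllib.request

req = urllib.request.Request("https://www.baidu.com")
# add_header() is equivalent to passing headers= to Request()
req.add_header("User-Agent",
               "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0")
response = urllib.request.urlopen(req)
print(response.getcode())
```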
GET requests: urllib.parse.quote()
# https://www.baidu.com/s?wd=%E5%91%A8%E6%9D%B0%E4%BC%A6 = https://www.baidu.com/s?wd=周杰伦
from urllib import request, parse
# Fetch https://www.baidu.com/s?wd=周杰伦
# Percent-encode the characters 周杰伦 with urllib.parse.quote("周杰伦")
url = f"https://www.baidu.com/s?wd={parse.quote('周杰伦')}"
# UA header, the first countermeasure against anti-scraping
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0"
}
# Build the request object
paramRequest = request.Request(url=url, headers=headers)
# Simulate a browser sending the request to the server
response = request.urlopen(paramRequest)
print(response.read().decode("UTF-8"))
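Note that quote() leaves '/' unescaped by default (safe='/'), while quote_plus() escapes it and encodes spaces as '+'; a quick comparison:

```python
from urllib import parse

print(parse.quote("周杰伦"))      # %E5%91%A8%E6%9D%B0%E4%BC%A6
print(parse.quote("a b/c"))       # a%20b/c  ('/' is kept)
print(parse.quote_plus("a b/c"))  # a+b%2Fc  (space -> '+', '/' escaped)
```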
GET requests: urllib.parse.urlencode()
# Fetch the page source of
# https://www.baidu.com/s?wd=%E5%91%A8%E6%9D%B0%E4%BC%A6&sex=%E7%94%B7&location=%E4%B8%AD%E5%9B%BD%E5%8F%B0%E6%B9%BE%E7%9C%81
import urllib.request, urllib.parse
url = "https://www.baidu.com/s?"
data = {
    "wd": "周杰伦",
    "sex": "男",
    "location": "中国台湾省"
}
data = urllib.parse.urlencode(data)  # wd=%E5%91%A8%E6%9D%B0%E4%BC%A6&sex=%E7%94%B7&location=%E4%B8%AD%E5%9B%BD%E5%8F%B0%E6%B9%BE%E7%9C%81
# Full request URL
url = url + data  # https://www.baidu.com/s?wd=%E5%91%A8%E6%9D%B0%E4%BC%A6&sex=%E7%94%B7&location=%E4%B8%AD%E5%9B%BD%E5%8F%B0%E6%B9%BE%E7%9C%81
# UA header to avoid the anti-scraping check
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0"
}
paramRequest = urllib.request.Request(url=url, headers=headers)
# Fetch the resource
response = urllib.request.urlopen(paramRequest)
print(response.read().decode("UTF-8"))
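urlencode() also handles repeated keys: with doseq=True a list value expands into one key=value pair per element. A short sketch (the tags parameter is made up for illustration):

```python
from urllib import parse

params = {"wd": "周杰伦", "tags": ["pop", "rock"]}
print(parse.urlencode(params, doseq=True))
# wd=%E5%91%A8%E6%9D%B0%E4%BC%A6&tags=pop&tags=rock
```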
POST requests: urllib.parse.urlencode(data).encode("UTF-8")
import urllib.request, urllib.parse
url = "https://fanyi.baidu.com/sug"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0"
}
# Request parameters
data = {
    "kw": "spider"
}
# POST parameters must be encoded to bytes: urllib.parse.urlencode(data).encode("UTF-8")
data = urllib.parse.urlencode(data).encode("UTF-8")
# POST parameters go in the data argument when building the Request
paramRequest = urllib.request.Request(url=url, data=data, headers=headers)
# Simulate a browser sending the request to the server
response = urllib.request.urlopen(paramRequest)
# Response data
content = response.read().decode("UTF-8")
# Parse str -> JSON
import json
print(json.loads(content))
Differences between POST and GET
With urllib the difference is mechanical: GET parameters are urlencoded and appended to the URL's query string, while POST parameters are urlencoded, encoded to bytes, and passed via the Request's data argument, which also switches the HTTP method to POST. The sketch below shows the contrast side by side.
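A minimal sketch (example.com is a placeholder URL):

```python
from urllib import parse, request

params = parse.urlencode({"kw": "spider"})

# GET: parameters live in the URL's query string
getRequest = request.Request("https://example.com/search?" + params)

# POST: parameters are encoded to bytes and passed as data;
# urlopen sends a POST whenever data is not None
postRequest = request.Request("https://example.com/search", data=params.encode("UTF-8"))
```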
Case study: Baidu detailed translation
import urllib.parse, urllib.request
url = "https://fanyi.baidu.com/v2transapi?from=en&to=zh"
# Of all the captured request headers, only Cookie turns out to be required here; the rest are kept commented out for reference
headers = {
#'Accept': '*/*',
#'Accept-Encoding': 'gzip, deflate, br',
#'Accept-Language': 'zh-CN,zh;q=0.9',
#'Acs-Token': '1683294833496_1683294866698_LkSNNUVfH6cfR5UP++pWDoE0QA1qvlsA3ioNxDki0ReLU0VdXsIgbPKUJe13YXG0Hat88e9WhofgGXxu5vJ3QOKytAZ9+zirnFQsyOCrnbKEAqKDymGlthJmmZTvG61lRJLuCuGSgh8sN7Tp5n+vPEAPb8GC/30pEMncal3vTI+bnUSJ6mQ5/92C/dwzq2wgx6uVAVZD9Dw5TJvOq+Hgq7QtrGIoGGBG/QYOL8+tbVvXTcCRJ8esz16WJef8d0oyFwdHANuZYPHPggSSXsQiUv69b2S+GM127O5xA6lzmUWNxg7b0d8ypaQK7GXl9jHx3GxC0Gx4mVm+89H+l7HZpejF3add2LsGdAnVoYE9ZAdiYFH77/F9VIEJEgj6BByTwgJ8Fka+SEm1B0goV3k9KbNaYn9soMnHAe5PeXUT5veiHWhqMHYp+K443rTWRONT23ofsqyl1x+F+hOgPGqKQ0UFYOI66/rwCsDZiMXAsNFSU2EgdNe38FLh8yJj+I4+',
#'Connection': 'keep-alive',
#'Content-Length': '135',
#'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'Cookie': 'BIDUPSID=2DE3E3D663B72B4BC835EDB5B172B2F1; PSTM=1683294818; BAIDUID=2DE3E3D663B72B4BF506C0091DF59930:FG=1; BA_HECTOR=2g0k21040g2k21ah20ah814q1i5a2j21m; BAIDUID_BFESS=2DE3E3D663B72B4BF506C0091DF59930:FG=1; ZFY=iRhz0VKUtwbSH69ydEls11T9mOe:BIi3vnc7rCt1nOnE:C; PSINO=1; H_PS_PSSID=38515_36545_38529_38469_38468_38486_37709_38504_26350_38570_38543; delPer=0; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BCLID=10626756841766805276; BCLID_BFESS=10626756841766805276; BDSFRCVID=r_0OJexroG07VWbfoDb7u0fNA_weG7bTDYrEOwXPsp3LGJLVFe3JEG0Pts1-dEu-S2OOogKKBeOTHg_F_2uxOjjg8UtVJeC6EG0Ptf8g0M5; BDSFRCVID_BFESS=r_0OJexroG07VWbfoDb7u0fNA_weG7bTDYrEOwXPsp3LGJLVFe3JEG0Pts1-dEu-S2OOogKKBeOTHg_F_2uxOjjg8UtVJeC6EG0Ptf8g0M5; H_BDCLCKID_SF=tRAOoC_-tDvDqTrP-trf5DCShUFsbq4jB2Q-XPoO3KtbSx3Pb4nU-6-0DMjw3lRf5mkf3fbgy4op8P3y0bb2DUA1y4vp0tLeWeTxoUJ2-KDVeh5Gqq-KXU4ebPRi3tQ9QgbMMhQ7tt5W8ncFbT7l5hKpbt-q0x-jLTnhVn0MBCK0HPonHj8aj6jQ3D; H_BDCLCKID_SF_BFESS=tRAOoC_-tDvDqTrP-trf5DCShUFsbq4jB2Q-XPoO3KtbSx3Pb4nU-6-0DMjw3lRf5mkf3fbgy4op8P3y0bb2DUA1y4vp0tLeWeTxoUJ2-KDVeh5Gqq-KXU4ebPRi3tQ9QgbMMhQ7tt5W8ncFbT7l5hKpbt-q0x-jLTnhVn0MBCK0HPonHj8aj6jQ3D; APPGUIDE_10_0_2=1; REALTIME_TRANS_SWITCH=1; FANYI_WORD_SWITCH=1; HISTORY_SWITCH=1; SOUND_SPD_SWITCH=1; SOUND_PREFER_SWITCH=1; Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1683294832; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1683294832; ab_sr=1.0.1_MTBkZmQwNzVkMzcyZjY2MTM1MTQyNzM2MzRmYzczNTlmZjY4YzczZmQ3MDVhNmRiNWEyZWNhNjc1MTRkMjFhZWU3ZTk0MTY1NGU0NTFkYzRkNmI1OGI1YTExZGVjNGJjZTkxMjY1YzJkNTI1MzgxOTZhNDg1NzQxZmEyZDZlNWU5OWFmZGUwNDJhY2I2ZGQ5N2MyYjU1NTJkZjI2NmUzYQ==',
#'Host': 'fanyi.baidu.com',
#'Origin': 'https://fanyi.baidu.com',
#'Referer': 'https://fanyi.baidu.com/?aldtype=16047',
#'sec-ch-ua': '"Chromium";v="112", "Google Chrome";v="112", "Not:A-Brand";v="99"',
#'sec-ch-ua-mobile': '?0',
#'sec-ch-ua-platform': '"Windows"',
#'Sec-Fetch-Dest': 'empty',
#'Sec-Fetch-Mode': 'cors',
#'Sec-Fetch-Site': 'same-origin',
#'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36',
#'X-Requested-With': 'XMLHttpRequest'
}
# Parameters
data = {
    'from': 'en',
    'to': 'zh',
    'query': 'love',
    'transtype': 'realtime',
    'simple_means_flag': '3',
    'sign': '198772.518981',
    'token': '469aebfdada94cf9299fdf782c731610',
    'domain': 'common'
}
# Encode the parameters to bytes
data = urllib.parse.urlencode(data).encode("UTF-8")
# Build the request object
paramRequest = urllib.request.Request(url, data, headers)
# Send the request
response = urllib.request.urlopen(paramRequest)
content = response.read().decode("UTF-8")
import json
print(json.loads(content))
# Case study: Douban movie chart API (returns JSON), single page
import urllib.request
url = "https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&start=0&limit=20"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0"
}
paramRequest = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(paramRequest)
content = response.read().decode("UTF-8")
with open('./data/douban1.json', 'w', encoding="UTF-8") as f:
    f.write(content)
import urllib.parse
import urllib.request

# Build the request object for a given page
def createRequest(page):
    baseUrl = "https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action="
    data = {
        "start": (page - 1) * 20,
        "limit": 20
    }
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0"
    }
    url = baseUrl + urllib.parse.urlencode(data)
    request = urllib.request.Request(url=url, headers=headers)
    return request

# Fetch the response content
def getContent(request):
    response = urllib.request.urlopen(request)
    content = response.read().decode("UTF-8")
    return content

# Save the content to a file
def downLoad(page, content):
    with open(f"./data/douban_{page}.json", "w", encoding="UTF-8") as f:
        f.write(content)

if __name__ == '__main__':
    startPage = int(input("Enter the start page: "))
    endPage = int(input("Enter the end page: "))
    for page in range(startPage, endPage + 1):
        # Build the request object
        request = createRequest(page)
        # Fetch the content
        content = getContent(request)
        # Save it
        downLoad(page, content)
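Once the pages are saved they can be consumed with json.load(). A minimal sketch, assuming each entry in the Douban response is an object with a "title" field (not verified here):

```python
import json

with open("./data/douban_1.json", encoding="UTF-8") as f:
    movies = json.load(f)
for movie in movies:
    # "title" is an assumed field name in the chart response
    print(movie.get("title"))
```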
# Case study: KFC store list (POST, paginated)
import urllib.parse, urllib.request
url = "http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0"
}

# Build the request object for a given page
def createRequest(page):
    data = {
        'cname': '北京',
        'pid': '',
        'pageIndex': page,
        'pageSize': '10',
    }
    data = urllib.parse.urlencode(data).encode("UTF-8")
    request = urllib.request.Request(url, data, headers)
    return request

# Fetch the response content
def getContent(request):
    response = urllib.request.urlopen(request)
    content = response.read().decode("UTF-8")
    return content

# Save the content to a file
def downLoad(page, content):
    with open(f'kfc{page}.json', "w", encoding="UTF-8") as f:
        f.write(content)

if __name__ == '__main__':
    startPage = int(input("Enter the start page: "))
    endPage = int(input("Enter the end page: "))
    for page in range(startPage, endPage + 1):
        # Build the request object
        request = createRequest(page)
        # Fetch the content
        content = getContent(request)
        # Save it
        downLoad(page, content)
import urllib.request, urllib.error
url = "https://blog.csdn.net/youyouxiong/article/details/125141038"  # triggers an HTTPError
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0"
}
try:
    paramRequest = urllib.request.Request(url=url, headers=headers)
    response = urllib.request.urlopen(paramRequest)
    print(response.read().decode("UTF-8"))
# HTTPError is a subclass of URLError, so it must be caught first
except urllib.error.HTTPError:
    print("Something went wrong; the system is being upgraded...")
except urllib.error.URLError:
    print("The URL is unreachable...")
# The first attempt fails: the request headers do not carry enough information
import urllib.request
url = "https://weibo.com/set/index"
headers = {
#':authority': 'rm.api.weibo.com',
#':method': 'GET',
#':path': '/2/remind/push_count.json?trim_null=1&with_dm_group=1&with_reminding=1&with_settings=1&exclude_attitude=1&with_chat_group_notice=1&source=339644097&with_chat_group=1&with_dm_unread=1&callback=__jp0',
#':scheme': 'https',
'accept': '*/*',
#'accept-encoding': 'gzip, deflate, br',
'accept-language': 'zh-CN,zh;q=0.9',
# The cookie carries your login info; with a logged-in cookie you can reach any page
'cookie': '_s_tentry=weibo.com; appkey=; Apache=2573814727969.865.1683334145542; SINAGLOBAL=2573814727969.865.1683334145542; ULV=1683334145546:1:1:1:2573814727969.865.1683334145542:; login_sid_t=5e64f8037a70e7b19f44fc4957ef2b58; cross_origin_proto=SSL; SUB=_2A25JUdzhDeThGeFO6lAS8inKzDyIHXVqJ0kprDV8PUNbmtANLVTwkW9NQZ4igTsqKDbmYHLfOeLx7ZbeK03UGOX8; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WFAmWP.PnUSj2a-kPEpLx3x5JpX5KzhUgL.FoM7eKz0eoMcS052dJLoIp7LxKML1KBLBKnLxKqL1hnLBoMNeh2Ee0zNSoM7; ALF=1714872368; SSOLoginState=1683336369',
# referer checks whether the request came from the previous page; commonly used for image hotlink protection
'referer': 'https://weibo.com/',
'sec-ch-ua': '"Chromium";v="112", "Google Chrome";v="112", "Not:A-Brand";v="99"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'sec-fetch-dest': 'script',
'sec-fetch-mode': 'no-cors',
'sec-fetch-site': 'same-site',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36',
}
paramRequest = urllib.request.Request(url=url,headers=headers)
response = urllib.request.urlopen(paramRequest)
content = response.read().decode("UTF-8")
print(content)
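Pasting a Cookie header by hand works but goes stale quickly. A sketch of letting urllib manage cookies itself via http.cookiejar and HTTPCookieProcessor (for authenticated pages you would first send the actual login request through the same opener):

```python
import http.cookiejar
import urllib.request

jar = http.cookiejar.CookieJar()
# the processor stores cookies from responses and replays them on later requests
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(jar))
opener.open("https://www.baidu.com")
for cookie in jar:
    print(cookie.name, cookie.value)
```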
import urllib.request
url = "https://www.baidu.com"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0"
}
paramRequest = urllib.request.Request(url=url, headers=headers)
# 1. Get a handler object: urllib.request.HTTPHandler()
handler = urllib.request.HTTPHandler()
# 2. Get an opener object: urllib.request.build_opener(handler)
opener = urllib.request.build_opener(handler)
# 3. Call the open method: opener.open(request)
response = opener.open(paramRequest)
print(response.read().decode("UTF-8"))
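When the opener should apply everywhere, install_opener() makes it the process-wide default so that plain urlopen() calls go through it; a minimal sketch:

```python
import urllib.request

opener = urllib.request.build_opener(urllib.request.HTTPHandler())
# after this, every urlopen() call in the process uses this opener
urllib.request.install_opener(opener)
response = urllib.request.urlopen("http://www.baidu.com")
print(response.getcode())
```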
import urllib.request
url = "https://www.baidu.com/s?wd=ip"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0"
}
paramRequest = urllib.request.Request(url=url, headers=headers)
# ProxyHandler maps URL scheme to proxy address, normally host:port;
# note that an "http" entry only applies to http:// URLs, so an https
# URL like the one above also needs an "https" entry
proxies = {
    "http": "221.176.140.214"
}
handler = urllib.request.ProxyHandler(proxies=proxies)
opener = urllib.request.build_opener(handler)
response = opener.open(paramRequest)
print(response.read().decode("UTF-8"))
import urllib.request, urllib.parse
import random

# A pool of candidate proxies; pick one at random for each request
proxiesPool = [
    {"http": "117.114.149.66:55443"},
    {"http": "210.5.10.87:53281"}
]
proxies = random.choice(proxiesPool)
url = "https://www.baidu.com/s?wd=ip"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0"
}
request = urllib.request.Request(url=url, headers=headers)
# Build the handler with the chosen proxy
handler = urllib.request.ProxyHandler(proxies=proxies)
# Build the opener
opener = urllib.request.build_opener(handler)
# Send the request
response = opener.open(request)
content = response.read().decode("UTF-8")
with open('ip.html', "w", encoding="UTF-8") as f:
    f.write(content)
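Free proxies fail often, so in practice you would fall back to another entry from the pool. A sketch of that retry loop, using an http:// URL so the "http"-keyed proxies actually apply:

```python
import random
import urllib.request, urllib.error

proxiesPool = [
    {"http": "117.114.149.66:55443"},
    {"http": "210.5.10.87:53281"}
]

# try each proxy in random order until one succeeds
for proxies in random.sample(proxiesPool, len(proxiesPool)):
    opener = urllib.request.build_opener(urllib.request.ProxyHandler(proxies=proxies))
    try:
        response = opener.open("http://www.baidu.com/s?wd=ip", timeout=5)
        print("succeeded via", proxies)
        break
    except (urllib.error.URLError, OSError):
        print("failed via", proxies, "- trying the next one")
```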