案例一:搜狗html页面爬取
# Case 1: fetch the Sogou homepage HTML and persist it to disk.
import requests

if __name__ == "__main__":
    # step 1: target URL
    # fix: the real Sogou domain is sogou.com, not sougou.com
    url = "https://www.sogou.com/"
    # step 2: send the request; requests.get returns a Response object
    response = requests.get(url=url)
    # step 3: .text is the response body decoded to a str
    page_text = response.text
    print(page_text)
    # step 4: persist to disk (with-block guarantees the file is closed)
    with open('./sougou.html', 'w', encoding='utf-8') as fp:
        fp.write(page_text)

案例二:搜狗搜索词数量爬取

# Case 2: Sogou keyword-search crawler; kw is any user-supplied query string.
import requests

if __name__ == "__main__":
    # UA spoofing: send a browser User-Agent so the site serves a normal page
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'
    }
    # fix: the real search endpoint lives on sogou.com, not sougou.com
    url = 'https://www.sogou.com/web/'
    # query-string parameters are packed into a dict and passed via params=
    kw = input('请输入词语:')
    param = {
        'query': kw
    }
    # the request URL sent is url + the encoded params
    response = requests.get(url=url, params=param, headers=headers)
    page_text = response.text
    # fix: add the missing dot so the saved file gets a proper .html extension
    fileName = kw + '.html'
    with open(fileName, 'w', encoding='utf-8') as fp:
        fp.write(page_text)
        print(fileName, '保存成功')

案例三:百度翻译词语动态爬取

# Case 3: Baidu Translate "sug" endpoint, dynamic (user-supplied word).
import requests
import json

if __name__ == "__main__":
    # 1. target URL (this endpoint expects a POST)
    post_url = "https://fanyi.baidu.com/sug"
    # 2. UA spoofing
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36 SE 2.X MetaSr 1.0'
    }
    # 3. POST form payload (same mechanism as GET params)
    word = input('请输入词语:')
    data = {
        # fix: send the variable, not the literal string 'word'
        'kw': word
    }
    # 4. send the request
    response = requests.post(url=post_url, headers=headers, data=data)
    # 5. .json() parses the body into Python objects
    #    (only safe when the response is actually JSON)
    dic_obj = response.json()
    # 6. persist; with-block fixes the leaked file handle of the original
    fileName = word + '.json'
    with open(fileName, 'w', encoding='utf-8') as fp:
        json.dump(dic_obj, fp=fp, ensure_ascii=False)
    print('保存成功,结束')

案例三(续):百度翻译词语静态爬取

#百度翻译静态实例

import requests
import json

# fix: the original tested `name` (NameError) and left the guarded body
# empty with the real code unindented below it; everything now runs inside
# a correct `__name__` guard.
if __name__ == "__main__":
    # 1. target URL (POST endpoint)
    post_url = "https://fanyi.baidu.com/sug"
    # 2. UA spoofing
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36 SE 2.X MetaSr 1.0'
    }
    # 3. POST form payload — static variant: the word is hard-coded
    data = {
        'kw': 'dog'
    }
    # 4. send the request
    response = requests.post(url=post_url, data=data, headers=headers)
    # 5. parse the JSON body
    dic_obj = response.json()
    # 6. persist
    # fix: the original referenced an undefined `word`; name the file after
    # the hard-coded query instead, and close the handle via `with`
    fileName = 'dog.json'
    with open(fileName, 'w', encoding='utf-8') as fp:
        json.dump(dic_obj, fp=fp, ensure_ascii=False)
    print('保存成功,结束')

案例四:豆瓣电影排行榜爬取

# Case 4: Douban movie chart crawler (JSON API).
import requests
import json

if __name__ == "__main__":
    url = 'https://movie.douban.com/j/chart/top_list'
    param = {
        'type': '5',
        'interval_id': '100:90',
        'action': '',
        'start': '0',   # index of the first movie to fetch
        'limit': '20',  # number of movies per request
    }
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36 SE 2.X MetaSr 1.0'
    }
    response = requests.get(url=url, headers=headers, params=param)
    # the API returns a JSON list of movie records
    list_data = response.json()
    # fix: the original opened the file without ever closing it;
    # the with-block guarantees the handle is released
    with open('./douban.json', 'w', encoding='utf-8') as fp:
        json.dump(list_data, fp=fp, ensure_ascii=False)
    print('爬取保存成功')

案例五:肯德基地址静态爬取

# KFC store-locator crawl, static single-page version (tested OK;
# does not iterate over result pages).
import requests
import json

if __name__ == "__main__":
    # form payload: keyword search for one page of results
    form_data = {
        'cname': '',
        'pid': '',
        'keyword': '北京',
        'pageIndex': '1',
        'pageSize': '10',
    }
    # browser User-Agent so the endpoint answers normally
    ua_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36'
    }
    api_url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'

    # POST the form and keep the raw response text
    result_text = requests.post(url=api_url, headers=ua_headers, data=form_data).text
    # persist under the keyword name
    with open('北京', 'w', encoding='utf-8') as out:
        out.write(result_text)
    print('爬取成功')
文章目录