You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
67 lines
2.4 KiB
Python
67 lines
2.4 KiB
Python
# coding: utf-8
|
|
|
|
import requests
|
|
import parsel
|
|
import os
|
|
|
|
|
|
# Request headers sent with every HTTP call in this script: a desktop Chrome
# User-Agent string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/89.0.4389.90 Safari/537.36"
    ),
}
|
|
|
|
def get_oneclass(url_class):
    """Download every gallery linked from one listing page.

    Fetches ``url_class``, walks each ``.chanpin_list dl`` entry, makes sure a
    folder named after the entry's ``alt`` text exists under ``img`` and then
    passes the entry's link to ``get_onepeple``.

    Args:
        url_class: URL of the listing page to crawl.

    Side effects:
        Rebinds the module-level ``dl_name`` for each processed entry;
        ``get_onepeple`` reads it to build the path images are written to.
        Creates directories on disk.

    Raises:
        requests.RequestException: if the listing page cannot be fetched
            (including when the timeout expires).
    """
    global dl_name

    # A timeout keeps one unresponsive server from stalling the whole crawl.
    response_page_class = requests.get(url=url_class, headers=headers, timeout=30)
    selector_class = parsel.Selector(response_page_class.text)

    for dl in selector_class.css('.chanpin_list dl'):
        dl_url = dl.css('dt a ::attr(href)').get()
        entry_name = dl.css('dt a ::attr(alt)').get()  # becomes the folder name
        if not dl_url or not entry_name:
            # Without a link or a name there is nothing to fetch or nowhere
            # to store it; skip instead of failing on None.
            continue

        dl_name = entry_name
        # makedirs also creates the parent "img" folder on first use and does
        # not fail when the folder already exists.
        os.makedirs('img\\' + dl_name, exist_ok=True)
        get_onepeple(dl_url)
|
|
|
|
def get_onepeple(url):
    """Save every image found on one gallery page.

    Fetches ``url``, selects the ``.neirong_body p strong img`` elements and
    writes each image's bytes into the ``img`` sub-folder named by the
    module-level ``dl_name``. The file name is the last ``/``-separated
    segment of the image URL.

    Args:
        url: URL of the gallery page.

    Note:
        ``dl_name`` is set by ``get_oneclass`` before it calls this function,
        and the target folder is expected to exist already.

    Raises:
        requests.RequestException: if the page or an image cannot be fetched
            (including when the timeout expires).
        OSError: if an image file cannot be written.
    """
    response_page = requests.get(url=url, headers=headers, timeout=30)
    selector = parsel.Selector(response_page.text)

    for img in selector.css('.neirong_body p strong img'):
        img_url = img.css('img ::attr(src)').get()
        if not img_url:
            # An <img> without a src has nothing to download.
            continue
        img_name = img_url.split("/")[-1]

        image_response = requests.get(url=img_url, headers=headers, timeout=30)
        if not image_response.ok:
            # Do not store an HTTP error page under an image file name.
            print("下载失败:", img_name)
            continue

        with open(f"img\\{dl_name}\\{img_name}", mode='wb') as f:
            f.write(image_response.content)
        print("保存完成:", img_name)
|
|
|
|
|
|
# Driver: collect the listing-page URLs and crawl each one in turn.

# Links discovered in the home page's top navigation bar.
url_list_1 = []

# Listing pages that are always crawled, whether or not the navigation bar
# links to them.
url_aepnu = "http://www.xunfangimg.com/aepnu/list_1.html"
url_z7qnv_1 = 'http://www.xunfangimg.com/z7qnv/list_1.html'
url_z7qnv_2 = 'http://www.xunfangimg.com/z7qnv/list_2.html'
url_zp7sg_1 = 'http://www.xunfangimg.com/zp7sg/list_1.html'
url_full_1 = "http://www.xunfangimg.com/"

url_list_2 = [url_aepnu, url_z7qnv_1, url_z7qnv_2, url_zp7sg_1, url_full_1]

response_full = requests.get(url=url_full_1, headers=headers, timeout=30)
selector_full = parsel.Selector(response_full.text)
lis_full = selector_full.css('.top_nav ul li ')
for li_full in lis_full:
    url_class_1 = li_full.css('a ::attr(href)').get()
    if url_class_1:
        # Navigation items without a link yield None; keep real URLs only.
        url_list_1.append(url_class_1)

# dict.fromkeys drops repeated URLs while keeping first-seen order, so a page
# present in both lists is crawled once.
url_list = list(dict.fromkeys(url_list_1 + url_list_2))

for url in url_list:
    print(url + "开始爬取" )
    try:
        get_oneclass(url)
    except Exception as error:
        # Best effort: report the failure and move on to the next page.
        # Catching Exception (not a bare except) lets Ctrl-C stop the crawl.
        print("爬取失败", repr(error))