# coding: utf-8
import re
import os
import time
import threading
from multiprocessing import Pool, cpu_count
import requests
from bs4 import BeautifulSoup
HEADERS = {
    'X-Requested-With': 'XMLHttpRequest',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'Referer': 'http://www.mzitu.com'
}

save_path = 'D:\\data\\crawl\\meizitujk\\'
base_url = 'http://m.772586.com'
lock = threading.Lock()  # global resource lock (per-process only; each pool worker gets its own copy)

def urls_crawler_index(url):
    """
    Crawler entry point: collect the category index URLs from the home page,
    then crawl each category in turn.
    """
    try:
        html = requests.get(url, headers=HEADERS, timeout=50)
        html.encoding = 'utf-8'
        soup = BeautifulSoup(html.text, "lxml")
        tags = soup.find('ul', class_='tag').find_all('li')
        # Two fixed categories plus everything listed in the tag cloud.
        all_index_list = [base_url + '/meinv/', base_url + '/mxmn/']
        for tag in tags:
            print(tag.find('a')['href'])
            all_index_list.append(base_url + tag.find('a')['href'])
        print("========")
        for index_detail_url in all_index_list:
            urls_crawler_index_list(index_detail_url)
    except Exception as e:
        print(e)

def urls_crawler_index_list(index_url):
    """
    Crawl one category index: read its total page count, walk every list page,
    and dispatch the gallery detail URLs to the process pool.
    """
    try:
        html = requests.get(index_url, headers=HEADERS, timeout=10)
        html.encoding = 'utf-8'
        soup = BeautifulSoup(html.text, "lxml")
        # The total page count sits after the '/' in the "allpage" link text.
        max_page_txt = soup.find('div', class_='page').find('a', class_='allpage').text
        max_page = int(max_page_txt[max_page_txt.rfind('/') + 1:])
        print(max_page)
        for page_idx in range(max_page):
            if page_idx == 0:
                craw_content_page_url = index_url + 'index.html'
            else:
                craw_content_page_url = index_url + 'index_' + str(page_idx + 1) + '.html'
            html_content_page = requests.get(craw_content_page_url, headers=HEADERS, timeout=10)
            html_content_page.encoding = 'utf-8'
            soup_content_page = BeautifulSoup(html_content_page.text, "lxml")
            index_page_details = soup_content_page.find('div', id='list').find('ul').find_all('li')
            detail_urls = []
            for page_url_li in index_page_details:
                detail_crawl_url = base_url + page_url_li.find('a')['href']
                print("detail_crawl_url:" + detail_crawl_url)
                detail_urls.append(detail_crawl_url)
            try:
                pool.map(urls_crawler_page, detail_urls)
            except Exception:
                # Back off and retry once if the pool dispatch fails.
                time.sleep(30)
                pool.map(urls_crawler_page, detail_urls)
    except Exception as e:
        print(e)

def urls_crawler_page(url):
    """
    Crawl one gallery: read its title and image count, then build the URL of
    every image page and hand them to url_detail_crawler.
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=50)
        response.encoding = 'utf-8'
        r = response.text
        folder_name = BeautifulSoup(r, 'lxml').find(
            'div', class_="contimglist").find('img')['alt'].replace("?", " ")
        print(folder_name)
        with lock:
            # Number of images in the gallery. Note: a threading.Lock is not
            # shared across pool worker processes, so this only serializes
            # threads inside one process.
            max_count = BeautifulSoup(r, 'lxml').find(
                'div', class_='page').find_all(
                'span')[0].next_sibling[1:]
        page_urls = []
        for i in range(1, int(max_count) + 1):
            if i == 1:
                page_urls.append(url[0:url.rfind('.html')] + '.html')
            else:
                page_urls.append(url[0:url.rfind('.html')] + '_' + str(i) + '.html')
        url_detail_crawler(page_urls, folder_name)
    except Exception as e:
        print(e)

def url_detail_crawler(page_urls, folder_name):
    """
    Fetch every image page of a gallery, extract the image URL from each, and
    save the images one by one.
    """
    img_urls = []
    for page_url in page_urls:
        time.sleep(0.25)  # throttle the requests a little
        result = requests.get(page_url, headers=HEADERS, timeout=10).text
        img_url = BeautifulSoup(result, 'lxml').find(
            'div', class_="contimglist").find('a').find('img')['src']
        img_urls.append(img_url)
    for cnt, url in enumerate(img_urls):
        save_pic(url, cnt, folder_name)

def save_pic(pic_src, pic_cnt, folder_name):
    """
    Save one image to the local disk under save_path/folder_name.
    """
    try:
        time.sleep(0.10)
        base_path = os.path.join(save_path, folder_name)
        if not os.path.isdir(base_path):
            os.makedirs(base_path)
        img = requests.get(pic_src, headers=HEADERS, timeout=10)
        img_name = os.path.join(base_path, "pic_cnt_{}.jpg".format(pic_cnt + 1))
        # 'wb' so a re-run overwrites instead of appending to an existing file.
        with open(img_name, 'wb') as f:
            f.write(img.content)
        print(img_name)
    except Exception as e:
        print(e)

if __name__ == "__main__":
    # Test a single gallery:
    # urls = ['http://m.772586.com/qingchun/17454.html']
    # pool = Pool(processes=cpu_count())
    # try:
    #     pool.map(urls_crawler_page, urls)
    # except Exception:
    #     time.sleep(30)
    #     pool.map(urls_crawler_page, urls)
    #
    # Test a single image page:
    # page_urls = ['http://m.772586.com/qingchun/17454_2.html']
    # urls_crawler(page_urls)

    # Start from the home page.
    pool = Pool(processes=10)
    start_url = 'http://m.772586.com/'
    urls_crawler_index(start_url)