```python
# coding: utf-8
import re
import os
import time
import threading
from multiprocessing import Pool, cpu_count

import requests
from bs4 import BeautifulSoup

HEADERS = {
    'X-Requested-With': 'XMLHttpRequest',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'Referer': 'http://www.mzitu.com'
}

save_path = 'D:\\data\\crawl\\meizitujk\\'
base_url = 'http://m.772586.com'
lock = threading.Lock()  # global resource lock (only effective within a single process)


def urls_crawler_index(url):
    """Crawler entry point: collect all category index URLs and crawl each one."""
    try:
        html = requests.get(url, headers=HEADERS, timeout=50)
        html.encoding = 'utf-8'
        soup = BeautifulSoup(html.text, "lxml")
        tags = soup.find('ul', class_='tag').find_all('li')
        allIndexList = []
        allIndexList.append(base_url + '/meinv/')
        allIndexList.append(base_url + '/mxmn/')
        for tag in tags:
            print(tag.find('a')['href'])
            allIndexList.append(base_url + tag.find('a')['href'])
        print("========")
        for index_detail_url in allIndexList:
            urls_crawler_index_list(index_detail_url)
    except Exception as e:
        print(e)


def urls_crawler_index_list(index_url):
    """Crawl every listing page of one category and hand the detail URLs to the pool."""
    try:
        html = requests.get(index_url, headers=HEADERS, timeout=10)
        html.encoding = 'utf-8'
        soup = BeautifulSoup(html.text, "lxml")
        max_page_txt = soup.find('div', class_='page').find('a', class_='allpage').text
        max_page = int(max_page_txt[max_page_txt.rfind('/') + 1:])
        print(max_page)
        for xxx_page in range(max_page):
            if xxx_page == 0:
                craw_content_page_url = index_url + 'index.html'
            else:
                craw_content_page_url = index_url + 'index_' + str(xxx_page + 1) + '.html'
            html_content_page = requests.get(craw_content_page_url, headers=HEADERS, timeout=10)
            html_content_page.encoding = 'utf-8'
            soup_content_page = BeautifulSoup(html_content_page.text, "lxml")
            index_page_details = soup_content_page.find('div', id='list').find('ul').find_all('li')
            detail_urls = []
            for page_url_li in index_page_details:
                detail_crawl_url = base_url + page_url_li.find('a')['href']
                print("detail_crawl_url:" + detail_crawl_url)
                detail_urls.append(detail_crawl_url)
            try:
                pool.map(urls_crawler_page, detail_urls)
            except Exception:
                time.sleep(30)
                pool.map(urls_crawler_page, detail_urls)
    except Exception as e:
        print(e)


def urls_crawler_page(url):
    """Crawl one album page: work out all of its image pages, then download them."""
    try:
        response = requests.get(url, headers=HEADERS, timeout=50)
        response.encoding = 'utf-8'
        r = response.text
        folder_name = BeautifulSoup(r, 'lxml').find(
            'div', class_="contimglist").find('img')['alt'].replace("?", " ")
        print(folder_name)
        with lock:
            # number of images in this album
            max_count = BeautifulSoup(r, 'lxml').find(
                'div', class_='page').find_all('span')[0].next_sibling[1:]
            page_urls = []
            for i in range(1, int(max_count) + 1):
                if i == 1:
                    page_urls.append(url[0:url.rfind('.html')] + '.html')
                else:
                    page_urls.append(url[0:url.rfind('.html')] + '_' + str(i) + '.html')
            url_detail_crawler(page_urls, folder_name)
    except Exception as e:
        print(e)


def url_detail_crawler(page_urls, folder_name):
    """Resolve the image URL on every page of an album, then save each image."""
    img_urls = []
    for i, page_url in enumerate(page_urls):
        time.sleep(0.25)
        result = requests.get(page_url, headers=HEADERS, timeout=10).text
        img_url = BeautifulSoup(result, 'lxml').find(
            'div', class_="contimglist").find('a').find('img')['src']
        img_urls.append(img_url)
    for cnt, url in enumerate(img_urls):
        save_pic(url, cnt, folder_name)


def save_pic(pic_src, pic_cnt, folder_name):
    """Save one image to the local disk."""
    try:
        time.sleep(0.10)
        base_path = save_path + folder_name
        if not os.path.isdir(base_path):
            os.mkdir(base_path)
        img = requests.get(pic_src, headers=HEADERS, timeout=10)
        img_name = base_path + "\\" + "pic_cnt_{}.jpg".format(pic_cnt + 1)
        with open(img_name, 'ab') as f:
            f.write(img.content)
            print(img_name)
    except Exception as e:
        print(e)


if __name__ == "__main__":
    # urls = ['http://m.772586.com/qingchun/17454.html']
    # pool = Pool(processes=cpu_count())
    # try:
    #     pool.map(urls_crawler_page, urls)
    # except Exception:
    #     time.sleep(30)
    #     pool.map(urls_crawler_page, urls)

    # test
    # # urls_crawler(url)
    # page_urls = ['http://m.772586.com/qingchun/17454_2.html']
    # urls_crawler(page_urls)

    # start from the home page
    pool = Pool(processes=10)
    urls = 'http://m.772586.com/'
    urls_crawler_index(urls)
```