Batch-downloading images from URLs read out of an Excel sheet with Python


The spreadsheet currently looks like this:
(screenshot: image.png)
Column BT holds all of the image URLs.
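Before wiring up the full script, it helps to confirm which column headers actually read 站点名称 (site name) and 图片地址 (image URL). A quick check with xlrd, assuming test.xls sits next to the script and row 0 holds the headers, the same layout the scripts below rely on:

import xlrd

# Print every header cell so the column names used below can be verified.
workbook = xlrd.open_workbook('test.xls')
sheet = workbook.sheets()[0]
for col in range(sheet.ncols):
    print(col, sheet.cell_value(0, col))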

Final optimized version

#!/usr/bin/python
# -*- coding: utf-8 -*-
# @Time  : 2019/10/15 21:36
# @Author : cuijianzhe
# @File  : biaoge.py
# @Software: PyCharm

import xlrd
import requests
import os

path = 'test.xls'
workbook = xlrd.open_workbook(path)
Data_sheet = workbook.sheets()[0]
rowNum = Data_sheet.nrows # number of rows
colNum = Data_sheet.ncols # number of columns

def get_sitesname():
    '''    Collect the site names    '''
    # First approach: build a plain list
    sites_list = []
    i = 1    # start at 1 to skip the header row
    try:
        for s in range(Data_sheet.ncols):
            if Data_sheet.cell_value(0, s) == '站点名称':
                while i < rowNum:   # valid rows are 0..rowNum-1
                    if Data_sheet.cell_value(i, s) != "":
                        # append the row index so duplicate names stay distinct
                        sites_list.append(Data_sheet.cell_value(i, s) + "_" + str(i))
                    i += 1
    except IndexError:
        pass
    return sites_list
    # Second approach: return a list comprehension directly
    # return [Data_sheet.cell_value(i, 3) + "_" + str(i) for i in range(1,2) if i not in [] for i in range(1,rowNum-2) if Data_sheet.cell_value(i, 3)!= ""]
def get_url():
    '''    Collect every image URL from the '图片地址' column    '''
    url_list = []
    try:
        for b in range(Data_sheet.ncols):
            if Data_sheet.cell_value(0, b) == '图片地址':
                for row in range(1, rowNum - 2):   # skip the header row and the two footer rows
                    if Data_sheet.cell_value(row, b) != "":
                        # a cell may hold several URLs separated by ';'
                        url_list.append(Data_sheet.cell_value(row, b).replace(';', '\n'))
                    else:
                        # empty cell: fall back to a placeholder image
                        url_list.append('https://file.cjzshilong.cn/pictures_file/guohui-e67e7b3b.png\n')
    except IndexError:
        pass

    return url_list
def Download(url, site_name):
    '''    Create one directory per site and download its images (the os.chdir handling still needs work)    '''
    for num in range(len(url)):
        name = site_name[num]
        base_name = name.rsplit('_', 1)[0]    # strip the "_<row>" suffix added in get_sitesname()
        os.makedirs(name, exist_ok=True)      # do not crash if the directory already exists
        os.chdir(name)
        file_list = url[num].split('\n')
        with open('./' + base_name + '图片url_共计' + str(len(file_list) - 1) + '条.txt', 'w', encoding='utf-8') as f:
            f.write(url[num])
        d = 0
        for url_1 in file_list:
            if url_1 != '':
                res = requests.get(url_1)
                with open(base_name + '_' + str(d + 1) + '.jpg', 'wb') as p:
                    p.write(res.content)
                print('%s下载完毕' % name)
                d += 1
        os.chdir('..')

if __name__ == '__main__':
    url_ss = get_url()
    site_name = get_sitesname()
    Download(url_ss,site_name)
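The Download docstring above admits that juggling the working directory with os.chdir is still fragile. One possible workaround, shown only as a sketch, is to build explicit paths with os.path.join instead of changing directories; download_no_chdir and the downloads output folder are illustrative names, not part of the original script:

import os
import requests

def download_no_chdir(url, site_name, out_root='downloads'):
    # Sketch only: write every file through an explicit path instead of os.chdir().
    for num, block in enumerate(url):
        site_dir = os.path.join(out_root, site_name[num])
        os.makedirs(site_dir, exist_ok=True)
        for d, url_1 in enumerate(u for u in block.split('\n') if u):
            res = requests.get(url_1)
            with open(os.path.join(site_dir, str(d + 1) + '.jpg'), 'wb') as p:
                p.write(res.content)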

Third version:

  • look up the matching site name for each image
  • create a separate directory for each site
#!/usr/bin/python
# -*- coding: utf-8 -*-
# @Time  : 2019/10/15 21:36
# @Author : cuijianzhe
# @File  : biaoge.py
# @Software: PyCharm
import xlrd
import requests
import os
path = 'test.xls'
workbook = xlrd.open_workbook(path)
Data_sheet = workbook.sheets()[0] # first sheet of the workbook
rowNum = Data_sheet.nrows # number of rows
colNum = Data_sheet.ncols # number of columns

'''
Inspect individual cells
'''
# print(Data_sheet.cell_value(1,4))   # content of row 1, column 4
# print(Data_sheet.ncols)  # number of columns with data

sites_list = []
def get_sitesname():
    '''
    Collect the site names
    '''
    i = 0
    try:
        while i < rowNum:   # valid rows are 0..rowNum-1
            sites_list.append(Data_sheet.cell_value(i, 3))
            i += 1
    except IndexError:
        pass
    del sites_list[0]     # drop the header row
    del sites_list[-1]    # drop the two footer rows
    del sites_list[-1]
    '''
    Append a running number to every site name so that duplicates stay distinct
    '''
    for s, name in enumerate(sites_list, start=1):
        sites_list[s - 1] = name + '_' + str(s)
    return sites_list
def get_url():
    '''
    Locate the '图片地址' column and collect its URL cells
    '''
    name_col = '图片地址'
    url_list = []
    try:
        for b in range(Data_sheet.ncols):
            if Data_sheet.cell_value(0, b) == name_col:
                for row in range(rowNum):   # iterate over the data rows
                    url_data = Data_sheet.cell_value(row, b)
                    url_data2 = url_data.replace(';', '\n')
                    url_list.append(url_data2)
    except IndexError:
        pass
    del url_list[0]     # drop the header row
    del url_list[-1]    # drop the two footer rows
    del url_list[-1]
    '''
    Replace empty cells with a placeholder image
    '''
    for i in range(len(url_list)):
        if url_list[i] == '':
            url_list[i] = 'https://file.cjzshilong.cn/pictures_file/guohui-e67e7b3b.png'
    return url_list

def Download(url, site_name):
    '''
    Create one directory per site
    The working-directory handling of the batch download still has issues; to be fixed later
    '''
    c = 0
    for num in range(len(url)):
        name = site_name[num]
        os.mkdir(name)
        os.chdir(name)
        # dump this site's URLs to a text file, then read them back as a list
        with open('./url_' + str(c) + '.txt', 'w', encoding='utf-8') as f:
            f.write(url[num])
        with open('./url_' + str(c) + '.txt', 'r', encoding='utf-8') as fr:
            file_list = fr.read().split('\n')
        c += 1
        d = 0
        for url_1 in file_list:
            if url_1 == '':
                break
            res = requests.get(url_1)
            print('%s下载完毕' % name)
            with open(name + str(d) + '.jpg', 'wb') as p:
                p.write(res.content)
            d += 1
        os.chdir('..')

if __name__ == '__main__':
    url_List = get_url()
    site_name = get_sitesname()
    Download(url_List,site_name)

1.2 Exception handling

A cell in the spreadsheet may hold a single URL, several URLs separated by ';', or nothing at all. To keep those cases from being misread, the cell string is converted into a list, as sketched below.
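A minimal standalone sketch of that conversion, using made-up cell values; the ';' separator and the fallback placeholder URL are the same ones the scripts in this post use:

# Hypothetical cell contents: several URLs, a single URL, and a blank cell.
cells = [
    'http://a.example/1.jpg;http://a.example/2.jpg',
    'http://b.example/only.jpg',
    '',
]
PLACEHOLDER = 'https://file.cjzshilong.cn/pictures_file/guohui-e67e7b3b.png'

for cell in cells:
    # str -> list: split on ';', drop empty pieces, and substitute the placeholder for blank cells
    urls = [u for u in cell.split(';') if u] or [PLACEHOLDER]
    print(urls)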

#!/usr/bin/python
# -*- coding: utf-8 -*-
# @Time  : 2019/10/15 21:36
# @Author : cuijianzhe
# @File  : biaoge.py
# @Software: PyCharm
import xlrd
import requests
import random
import string
import os
import time
path = 'test.xls'
date = time.strftime('%Y-%m-%d',time.localtime())
workbook = xlrd.open_workbook(path)
# print(workbook.sheet_names())
Data_sheet = workbook.sheets()[0]
# print(Data_sheet.name)
rowNum = Data_sheet.nrows # number of rows
colNum = Data_sheet.ncols # number of columns
'''
Read every cell into a list of rows
'''
cell_rows = []
for i in range(rowNum):
    rowlist = []
    for j in range(colNum):
        rowlist.append(Data_sheet.cell_value(i, j))
    cell_rows.append(rowlist)   # append each completed row once

'''
Locate the '图片地址' column and collect its URL cells
'''
#print(Data_sheet.cell_value(0,43))   # header of column 43
#print(Data_sheet.ncols)  # number of columns with data
name_col = '图片地址'
url_list = []
try:
    for b in range(Data_sheet.ncols):
        if Data_sheet.cell_value(0, b) == name_col:
            for row in range(rowNum):   # iterate over the data rows
                url_data = Data_sheet.cell_value(row, b)
                url_data2 = url_data.replace(';', '\n')
                url_list.append(url_data2)
except IndexError:
    pass

del url_list[0]    # drop the header row
with open('./url.txt', 'w', encoding='utf-8') as f:
    for url in url_list:
        f.write(str(url))
        # f.write(',')
with open('./url.txt', 'r', encoding='utf-8') as d:
    file = d.read()
    file_list = file.split('\n')
del file_list[-1]   # drop the trailing empty entry

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'
}
name = 'pictures'
os.mkdir(name)
os.chdir(name)
for url_1 in file_list:
    name_1 = ''.join(random.sample(string.ascii_letters + string.digits, 8))   # random 8-character filename
    # name_1 = url_1.replace('/','')
    res = requests.get(url_1, headers=headers)
    print(('%s下载完毕') % (name_1))
    with open(name_1 + '.jpg', 'wb') as p:
        p.write(res.content)

The result:
(screenshot: image.png)
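Since this section is about catching exceptions, the download loop itself is also worth guarding so that one bad URL does not abort the whole run. A minimal sketch, reusing the same file_list and headers built above; download_all and the 10-second timeout are illustrative choices, and raise_for_status() turns HTTP error codes into exceptions:

import random
import string
import requests

def download_all(file_list, headers):
    # Sketch: download every URL, skipping the ones that fail instead of crashing.
    for url_1 in file_list:
        name_1 = ''.join(random.sample(string.ascii_letters + string.digits, 8))
        try:
            res = requests.get(url_1, headers=headers, timeout=10)
            res.raise_for_status()          # treat HTTP error codes as exceptions
        except requests.exceptions.RequestException as e:
            print('skip %s: %s' % (url_1, e))
            continue
        with open(name_1 + '.jpg', 'wb') as p:
            p.write(res.content)
        print('%s saved' % name_1)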

Packaging into an exe

pip install pyinstaller
  • using pyinstaller
pyinstaller -F biaoge.py
  • locate the generated file (with -F, PyInstaller writes a single executable into the dist/ directory)
    (screenshot: image.png)

References:
PyInstaller official documentation:
xlrd official documentation:
https://www.cnblogs.com/insane-Mr-Li/p/9092619.html
