On downloading files [just notes; corrections welcome]

The requirement: download files from a website and record both the file name and the local path. [For deduplication, the strategy keys on the id in the download URL; a sketch of extracting it follows.]
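As a side note, if the dedup id is carried as a query parameter, pulling it out is a one-liner with urllib.parse. A minimal sketch, assuming the parameter is literally named id (the helper name and example URL are made up):

from urllib.parse import urlparse, parse_qs

def url_id(url):
    # Hypothetical helper: extract the 'id' query parameter from a URL,
    # e.g. 'http://example.com/download?id=1234' -> '1234'
    return parse_qs(urlparse(url).query).get('id', [None])[0]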

scrapy download: when downloading files through scrapy, there was no way to get the local save path stored into the MySQL database.
wget: when downloading, the saved file name is just the URL-encoded string, which can easily collide; something like this:
%e8%bf%99%e6%98%af%e4%b8%80%e4%b8%aa%e6%96%87%e4%bb%b6%e5%90%8d.pdf
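For reference, the encoded name above decodes back to a readable one with urllib.parse.unquote:

from urllib.parse import unquote

# Decode the percent-encoded name that wget leaves behind:
print(unquote('%e8%bf%99%e6%98%af%e4%b8%80%e4%b8%aa%e6%96%87%e4%bb%b6%e5%90%8d.pdf'))
# -> 这是一个文件名.pdf ("this is a file name.pdf")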
urlretrieve: imported via from urllib.request import urlretrieve. Probably because the library is rather old, it would hang when downloading large files; even after borrowing a socket-based timeout workaround from code found online (sketched below), the results were still poor.
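The socket workaround referenced above amounts to setting a process-wide default timeout before calling urlretrieve; a minimal sketch (the 30-second value and the URL are placeholders, not from the original code):

import socket
from urllib.request import urlretrieve

# Give every new socket a default timeout so a stalled transfer raises
# socket.timeout instead of hanging forever.
socket.setdefaulttimeout(30)
try:
    urlretrieve('http://example.com/big.pdf', '/tmp/big.pdf')
except socket.timeout:
    print('download timed out; retry or give up')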
Code:
    # Imports this snippet needs:
    #   import socket
    #   import requests
    #   from urllib.parse import unquote
    #   from urllib.request import urlretrieve
    # download_files is a dict mapping each download URL to a file name
    if not download_files:
        return
    file_names = []
    try:
        for d in download_files.keys():
            if 'ftp' in d:
                continue
            temp = requests.get(d, headers=cls.headers)
            if 'Content-Disposition' in temp.headers:
                # the response names the file itself; decode it to get the suffix
                suffix = unquote(
                    temp.headers['Content-Disposition'].split('=')[-1]).split('.')[-1]
                file_title = FILE_STORE + d.split('=')[-1] + '.' + suffix
                urlretrieve(d, file_title)
                file_names.append(file_title)
            elif len(d.split('.')[-1]) < 5:
                if d.split('=')[-1] == d:
                    # no '=' in the URL: fall back to the last path segment
                    file_title = FILE_STORE + d.split('/')[-1]
                    urlretrieve(d, file_title)
                    file_names.append(file_title)
                    continue
                suffix = '.' + d.split('.')[-1]
                file_title = FILE_STORE + d.split('=')[-1] + suffix  # suffix already starts with '.'
                urlretrieve(d, file_title)
                file_names.append(file_title)
                cls.logger.debug(
                    'elif_download_files: {}, file_title: {}'.format(
                        download_files, file_title))

            else:
                res = requests.get(d, headers=cls.headers)
                if '系统出现异常' in res.text:  # the site's error-page marker
                    cls.logger.debug('else_if_err: {}'.format(res.text))
                    continue
                suffix = '.' + d.split('.')[-1]
                file_title = FILE_STORE + d.split('=')[-1] + suffix  # suffix already starts with '.'
                urlretrieve(d, file_title)
                file_names.append(file_title)
                cls.logger.debug(
                    'else_download_files: {}, file_title: {}'.format(
                        download_files, file_title))
        return file_names
    except socket.timeout:
        # on timeout, retry the whole batch up to three times
        count = 1
        while count <= 3:
            try:
                for d in download_files.keys():
                    if 'ftp' in d:
                        continue
                    temp = requests.get(d, headers=cls.headers)
                    if 'Content-Disposition' in temp.headers.keys():
                        file_title = unquote(
                            temp.headers['Content-Disposition'].split('=')[-1])
                        file_title = FILE_STORE + file_title
                        urlretrieve(d, file_title)
                        file_names.append(file_title)
                    elif len(d.split('.')[-1]) < 5:
                        if d.split('=')[-1] == d:
                            file_title = FILE_STORE + d.split('/')[-1]
                            urlretrieve(d, file_title)
                            file_names.append(file_title)
                            continue
                        suffix = '.' + d.split('.')[-1]
                        file_title = FILE_STORE + d.split('=')[-1] + suffix  # suffix already starts with '.'
                        urlretrieve(d, file_title)
                        file_names.append(file_title)
                        cls.logger.debug(
                            'elif_download_files: {}, file_title: {}'.format(
                                download_files, file_title))
                    else:
                        res = requests.get(d, headers=cls.headers)
                        if '系统出现异常' in res.text:
                            cls.logger.debug('else_if_err: {}'.format(res.text))
                            continue
                        suffix = '.' + d.split('.')[-1]
                        file_title = FILE_STORE + d.split('=')[-1] + suffix  # suffix already starts with '.'
                        urlretrieve(d, file_title)
                        file_names.append(file_title)
                        cls.logger.debug(
                            'else_download_files: {}, file_title: {}'.format(
                                download_files, file_title))
                return file_names

            except socket.timeout:
                err_info = 'Reloading, attempt %d' % count
                print(err_info)
                count += 1
        if count > 3:
            print("downloading failed!")

requests: when downloading a file you need to set the stream parameter; here the file is written in 1024-byte chunks (chunk_size=1024).
Code:
    # This variant additionally needs: from contextlib import closing
    file_names = []
    try:
        for d in download_files.keys():
            if 'ftp' in d:
                continue
            # note: this non-streamed GET pulls the whole body just to read the headers
            temp = requests.get(d, headers=cls.headers)
            if 'Content-Disposition' in temp.headers:
                suffix = unquote(
                    temp.headers['Content-Disposition'].split('=')[-1]).split('.')[-1]
                file_title = FILE_STORE + d.split('=')[-1] + '.' + suffix
                with closing(requests.get(
                        url=d,
                        verify=False, stream=True)) as res:  # verify=False skips TLS checks
                    with open(file_title, 'wb') as fd:
                        print('Downloading new file...')
                        for chunk in res.iter_content(chunk_size=1024):
                            if chunk:
                                fd.write(chunk)
                file_names.append(file_title)
            elif len(d.split('.')[-1]) < 5:
                if d.split('=')[-1] == d:
                    file_title = FILE_STORE + d.split('/')[-1]
                    with closing(requests.get(
                            url=d,
                            verify=False, stream=True)) as res:
                        with open(file_title, 'wb') as fd:
                            print('Downloading new file...')
                            for chunk in res.iter_content(chunk_size=1024):
                                if chunk:
                                    fd.write(chunk)
                    file_names.append(file_title)
                    continue
                suffix = '.' + d.split('.')[-1]
                file_title = FILE_STORE + d.split('=')[-1] + suffix  # suffix already starts with '.'
                with closing(requests.get(
                        url=d,
                        verify=False, stream=True)) as res:
                    with open(file_title, 'wb') as fd:
                        print('Downloading new file...')
                        for chunk in res.iter_content(chunk_size=1024):
                            if chunk:
                                fd.write(chunk)
                file_names.append(file_title)
                cls.logger.debug(
                    'elif_download_files: {}, file_title: {}'.format(
                        download_files, file_title))
            else:
                res = requests.get(d, headers=cls.headers)
                if '系统出现异常' in res.text:  # the site's error-page marker
                    cls.logger.debug('else_if_err: {}'.format(res.text))
                    continue
                suffix = '.' + d.split('.')[-1]
                file_title = FILE_STORE + d.split('=')[-1] + suffix  # suffix already starts with '.'
                with closing(requests.get(
                        url=d,
                        verify=False, stream=True)) as res:
                    with open(file_title, 'wb') as fd:
                        print('Downloading new file...')
                        for chunk in res.iter_content(chunk_size=1024):
                            if chunk:
                                fd.write(chunk)
                file_names.append(file_title)
                cls.logger.debug(
                    'else_download_files: {}, file_title: {}'.format(
                        download_files, file_title))
        return file_names
    except Exception as e:
        print(e)
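
For completeness, the pieces above (streaming, a hard timeout, retries) can be folded into one helper. A minimal sketch; the name stream_download, the 3 retries, and the 30-second timeout are all assumptions, not part of the original code:

import time
import requests

def stream_download(url, dest, headers=None, retries=3, timeout=30):
    # Stream the response to disk in chunks, with an explicit timeout and
    # a simple retry loop instead of the nested try/except above.
    for attempt in range(1, retries + 1):
        try:
            with requests.get(url, headers=headers, stream=True,
                              timeout=timeout) as res:
                res.raise_for_status()
                with open(dest, 'wb') as fd:
                    for chunk in res.iter_content(chunk_size=1024 * 1024):
                        if chunk:
                            fd.write(chunk)
            return dest
        except (requests.Timeout, requests.ConnectionError):
            time.sleep(attempt)  # crude backoff before the next attempt
    return None

Returning None on failure keeps the caller's pattern of appending to file_names only on success.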