Fast CSV file reading, removing blank lines, custom field order, encoding, log level, ignoring vs. retrying failed requests, increasing the max thread pool
Fast CSV reading, with a progress bar.
# CSV data file reading for the "达观杯" (Daguan Cup) competition
import pandas as pd
from tqdm import tqdm

def reader_pandas(file, chunkSize=100000, partitions=10 ** 4):
    # iterator=True returns a TextFileReader we can pull chunks from
    reader = pd.read_csv(file, iterator=True)
    chunks = []
    # partitions is only an upper bound for the progress bar;
    # the loop stops as soon as the file is exhausted
    with tqdm(range(partitions), 'Reading ...') as t:
        for _ in t:
            try:
                chunk = reader.get_chunk(chunkSize)
                chunks.append(chunk)
            except StopIteration:
                # get_chunk raises StopIteration at end of file
                break
    return pd.concat(chunks, ignore_index=True)

print(reader_pandas("./data/train_set.csv"))
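An equivalent variant (a sketch, not from the original post) passes chunksize= to pd.read_csv, so tqdm can wrap the chunk iterator directly and no partition count needs to be guessed:

def reader_pandas_v2(file, chunkSize=100000):
    # read_csv with chunksize= yields DataFrames lazily; tqdm ticks once per chunk
    chunks = list(tqdm(pd.read_csv(file, chunksize=chunkSize), desc='Reading ...'))
    return pd.concat(chunks, ignore_index=True)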
if __name__ == '__main__':
    from scrapy import cmdline
    cmdline.execute('scrapy crawl Pakistan_thenews'.split())
    # cmdline.execute('scrapy crawl Pakistan_thenews -o ./csv_file/Pakistan_thenews_p.csv -t csv'.split())
settings.py
# Custom field order (only the fields listed here are exported, in this order)
FEED_EXPORT_FIELDS = [
'country',
'category',
'data_url',
'title',
'abstract',
'content',
'img_url',
'press_time',
]
# Specify the delimiter in the settings.py file in the same directory
# CSV_DELIMITER = '\t'
# Export encoding (gb18030 keeps Chinese text readable when the CSV is opened in Excel)
FEED_EXPORT_ENCODING = "gb18030"
# Log level
# LOG_LEVEL = 'INFO'
# LOG_LEVEL = 'ERROR'
# LOG_FILE = 'mySpider.log'
# To ignore errors without re-requesting, set this to an empty list:
# RETRY_HTTP_CODES = []
RETRY_HTTP_CODES = [500, 502, 503, 504, 508, 400, 403, 404, 408, 520]
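# (Note, not from the original post: the retry list above works together with
# Scrapy's built-in retry settings; the values below are illustrative.)
# RETRY_ENABLED = True  # retrying is enabled by default
# RETRY_TIMES = 2       # max retries per failed request (Scrapy default: 2)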
# Increase the max thread pool size (Scrapy's default is 10, so a value of 1
# would shrink it; 20 here is an illustrative larger value)
REACTOR_THREADPOOL_MAXSIZE = 20
Removing blank lines
# In scrapy.exporters.CsvItemExporter, add the parameter newline='' to io.TextIOWrapper and the problem of blank lines between CSV rows is solved
import csv
import io

import six
from scrapy.exporters import BaseItemExporter

class CsvItemExporter(BaseItemExporter):

    def __init__(self, file, include_headers_line=True, join_multivalued=',', **kwargs):
        self._configure(kwargs, dont_fail=True)
        if not self.encoding:
            self.encoding = 'utf-8'
        self.include_headers_line = include_headers_line
        self.stream = io.TextIOWrapper(
            file,
            newline='',  # the added parameter: suppresses the extra \r that shows up as blank rows on Windows
            line_buffering=False,
            write_through=True,
            encoding=self.encoding
        ) if six.PY3 else file
        self.csv_writer = csv.writer(self.stream, **kwargs)
        self._headers_not_written = True
        self._join_multivalued = join_multivalued
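To make Scrapy pick up the patched exporter without editing the installed package, it can be registered for the csv format in settings.py; a minimal sketch, assuming the class above is saved in a hypothetical myproject/exporters.py:

# settings.py -- route csv feeds through the patched exporter
# ('myproject.exporters' is a hypothetical module path; adjust to your project)
FEED_EXPORTERS = {
    'csv': 'myproject.exporters.CsvItemExporter',
}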