import os
import logging

# Configure logging once at module level (basicConfig is a no-op on repeat calls)
logging.basicConfig(filename='file_processing.log', level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')

def process_file(file_name):
    file_path = f'data/{file_name}.txt'
    # Check whether the file exists
    if os.path.exists(file_path):
        logging.info(f"File '{file_name}' exists, reading...")
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                data = file.read()
                logging.info(f"File read successfully, contents: {data}")
                return data  # return the file contents
        except Exception as e:
            logging.error(f"Error while reading the file: {e}")
    else:
        logging.info(f"File '{file_name}' does not exist, creating...")
        try:
            os.makedirs('data', exist_ok=True)  # ensure the data/ directory exists
            with open(file_path, 'w', encoding='utf-8') as file:
                file.write("This is a newly created file")
            logging.info("File created successfully")
            return "This is a newly created file"  # return the new file's contents
        except Exception as e:
            logging.error(f"Error while creating the file: {e}")
    return None  # reached only if an error occurred

# Call the function to process the file and get its contents
file_content = process_file("001")
print(file_content)  # print the file contents
In theory, the implementation will vary with the structure of the website and the information to be scraped. Below is a simple example using the requests and BeautifulSoup libraries:
import requests
from bs4 import BeautifulSoup  # not used for whole-page hashing below; useful when parsing specific elements
import hashlib
import os

def get_website_content(url):
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    else:
        print(f"Failed to fetch the website content. Status code: {response.status_code}")
        return None
def save_data_to_file(data, filename):
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(data)

def read_data_from_file(filename):
    if os.path.exists(filename):
        with open(filename, 'r', encoding='utf-8') as file:
            return file.read()
    return None
def update_website_info(url, filename):
    current_content = get_website_content(url)
    if current_content:
        # Calculate the hash of the freshly fetched content
        current_hash = hashlib.md5(current_content.encode()).hexdigest()
        # Read the previously saved hash from file
        previous_hash = read_data_from_file(filename)
        if current_hash == previous_hash:
            # Website content hasn't changed, use the local data
            print("Website content hasn't changed. Using local data.")
            return read_data_from_file('local_data.txt')
        else:
            # Website content has changed: save the new hash and the new data
            print("Website content has changed. Updating and saving new data.")
            save_data_to_file(current_hash, filename)  # store the hash for the next comparison
            save_data_to_file(current_content, 'local_data.txt')  # store the content itself
            return current_content
    return None
# Example usage
url = 'https://example.com'
filename = 'website_data_hash.txt'
updated_data = update_website_info(url, filename)
# use updated_data for subsequent processing
In the example above, whether the website has been updated is checked by hashing its content. Specifically, hashlib.md5() computes the MD5 hash of the current content, which is then compared with the hash saved on the previous run. If the two are equal, the content has not changed and the local copy can be used directly; otherwise the site has been updated and the latest data must be fetched and saved.
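As a quick illustration of the mechanism (standard library only, nothing site-specific assumed), even a one-character difference in the input yields a completely different digest:

import hashlib

a = hashlib.md5("hello world".encode()).hexdigest()
b = hashlib.md5("hello world!".encode()).hexdigest()
print(a)       # 32 hex characters: 5eb63bbbe01eeed093cb22bb8f5acdc3
print(a == b)  # False: one extra character changes the digest entirely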
This approach rests on the premise that when the website's content changes, its hash changes with it. The caveat is that a whole-page hash is over-sensitive: many pages embed dynamic fragments (timestamps, ads, session tokens) that differ on every fetch, so the hash will report an "update" even when the meaningful content is unchanged. In practice you may need a finer-grained comparison, such as hashing only the specific part of the page you care about, or rely on other signals (e.g. Last-Modified/ETag headers) to detect updates.
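A minimal sketch of that finer-grained idea, assuming the meaningful content lives in a single element you can target with a CSS selector ('main' here is only a placeholder for whatever the real site uses):

import hashlib

import requests
from bs4 import BeautifulSoup

def content_fingerprint(url, selector='main'):
    """Hash only the text of one selected element instead of the whole page."""
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    element = soup.select_one(selector)  # 'main' is an assumed placeholder selector
    if element is None:
        return None  # selector didn't match; caller can fall back to whole-page hashing
    # Collapse whitespace so cosmetic reflows don't change the fingerprint
    text = ' '.join(element.get_text().split())
    return hashlib.md5(text.encode('utf-8')).hexdigest()

Substituting this for the whole-page hash in update_website_info would make the comparison ignore changes outside the selected element.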