数据集:网上下载的公共数据
环境:Python3.5
实现:计算了候选1项集C1、频繁1项集L1、候选2项集C2和频繁2项集L2
存在的不足:只能找到指定长度的候选K项集和频繁K项集,没有实现找出所有长度的K项集,但实现原理相同
数据格式如下:
9900000988 07004 水晶梨 1
9900000989 03001 300g壶瓶枣 1
9900000989 03002 484g壶瓶枣 1
9900000989 03004 2000g壶瓶枣礼盒一 1
...
9900000989 06002 夹子 2
9900000989 06003 蜻蜓扑克 1
9900000989 04002 800g沁州黄 1
9900000989 04004 5斤布袋沁州黄 1
上代码:
'''
Created on 2018年4月12日
@author: yqm
'''
import os
from itertools import combinations

# Apriori, first two levels only: candidate 1-itemsets (C1), frequent
# 1-itemsets (L1), candidate 2-itemsets (C2) and frequent 2-itemsets (L2).
#
# Each input line has four whitespace-separated fields:
#     <receipt number> <product code> <product name> <quantity>
# Rows that share a receipt number and are adjacent in the file form one order.

# Path of the transaction data file.
file_dir = "./data/data.txt"

# Minimum number of orders an itemset must appear in to be "frequent".
MIN_SUPPORT = 2


def load_rows(path, encoding="utf-8"):
    """Read the transaction file and return its rows as lists of fields.

    Blank lines are skipped so they cannot produce empty rows that would
    break the field lookups done by later stages.

    Args:
        path: Location of the data file.
        encoding: Text encoding used to decode the file.

    Returns:
        A list with one ``str.split()`` result per non-blank line.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    if not os.path.exists(path):
        raise FileNotFoundError("找不到文件" + path)
    with open(path, encoding=encoding) as f:
        return [line.split() for line in f if line.strip()]


def group_orders(rows):
    """Group consecutive rows with the same receipt number into orders.

    Args:
        rows: Iterable of field lists; field 0 is the receipt number,
            field 2 the product name and field 3 the quantity.

    Returns:
        A list of ``{product name: quantity}`` dicts, one per order, in file
        order.  A product listed twice in one order keeps the last quantity.
    """
    orders = []
    current_receipt = None
    current_order = None
    for row in rows:
        receipt, name, quantity = row[0], row[2], int(row[3])
        # Open a new order whenever the receipt number changes.  Registering
        # the dict as soon as it is created guarantees the final order is
        # kept and removes any need to know the first receipt number up front.
        if current_order is None or receipt != current_receipt:
            current_order = {}
            orders.append(current_order)
            current_receipt = receipt
        current_order[name] = quantity
    return orders


def count_items(orders):
    """Compute C1: the number of orders in which each product appears.

    Support is counted per order (not per input row) so that it is measured
    on the same basis as the pair counts produced by ``count_pairs``.

    Args:
        orders: List of ``{product name: quantity}`` dicts.

    Returns:
        A ``{product name: order count}`` dict.
    """
    counts = {}
    for order in orders:
        for name in order:
            counts[name] = counts.get(name, 0) + 1
    return counts


def frequent_items(counts, min_support=MIN_SUPPORT):
    """Compute L1: the entries of ``counts`` that reach ``min_support``.

    Args:
        counts: ``{product name: order count}`` dict (C1).
        min_support: Smallest count that is still considered frequent.

    Returns:
        A new dict; ``counts`` itself is left untouched.
    """
    return {
        name: count for name, count in counts.items() if count >= min_support
    }


def candidate_pairs(names):
    """Build the C2 candidates: every unordered pair of distinct names.

    ``itertools.combinations`` yields each pair exactly once and never pairs
    a name with itself, so no separate de-duplication pass is needed.

    Args:
        names: Iterable of distinct product names (the keys of L1).

    Returns:
        A list of ``[name1, name2]`` lists in the iteration order of
        ``names``.
    """
    return [[first, second] for first, second in combinations(names, 2)]


def count_pairs(pairs, orders):
    """Compute C2: attach to each candidate pair the number of orders
    that contain both products.

    Args:
        pairs: List of ``[name1, name2]`` candidates.
        orders: List of ``{product name: quantity}`` dicts.

    Returns:
        A new list of ``[name1, name2, count]`` lists; ``pairs`` is not
        modified.
    """
    counted = []
    for first, second in pairs:
        support = sum(
            1 for order in orders if first in order and second in order
        )
        counted.append([first, second, support])
    return counted


def frequent_pairs(counted_pairs, min_support=MIN_SUPPORT):
    """Compute L2: the counted pairs whose count reaches ``min_support``.

    Args:
        counted_pairs: List of ``[name1, name2, count]`` lists (C2).
        min_support: Smallest count that is still considered frequent.

    Returns:
        The qualifying entries, in their original order.
    """
    return [pair for pair in counted_pairs if pair[2] >= min_support]


def main(path=file_dir, min_support=MIN_SUPPORT):
    """Run the C1 -> L1 -> C2 -> L2 pipeline on ``path`` and print each stage.

    Args:
        path: Location of the data file.
        min_support: Smallest order count that is considered frequent.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    data_list = load_rows(path)
    orders = group_orders(data_list)

    c1 = count_items(orders)
    print("C1 = " + str(c1))

    l1 = frequent_items(c1, min_support)
    print("L1 = " + str(l1))

    c2 = count_pairs(candidate_pairs(l1), orders)
    print("C2 = " + str(c2))

    l2 = frequent_pairs(c2, min_support)
    print("L2 = " + str(l2))


if __name__ == "__main__":
    main()
运行结果部分展示:
C1 = {'宁化府十二珍醋': 2, '软中华': 4, '中南海0.8': 5, '大豆': 1, '800g*2壶瓶醉枣礼盒(桶)': 2,...}
L1 = {'宁化府十二珍醋': 2, '软中华': 4, '中南海0.8': 5, '800g*2壶瓶醉枣礼盒(桶)': 2, '牛肉258g': 2,...}
C2 = [['宁化府十二珍醋', '800g*2壶瓶醉枣礼盒(桶)', 0], ['宁化府十二珍醋', '散核桃仁', 0],...]
L2 = [['中南海0.8', '散大核桃', 2], ['牛肉258g', '牛肉258g', 2], ['牛肉258g', '软云', 2], ...]