Expedia数据挖掘(Kaggle比赛)

1.使用了SGDLR和Random Forest两种方法
2.结合data leakage
3.最终得分49.999,在kaggle排行榜中能排到104位(共1700多队)
1.leakage solution

# -*- coding: utf-8 -*-
from heapq import nlargest
from operator import itemgetter

def leakage_deal():
    """Scan ``train.csv`` and accumulate per-key hotel-cluster weights.

    Reads ``train.csv`` from the current working directory, skips the header
    row and, for every data row, adds a weight to up to four lookup tables.
    Each table maps ``hash(<colon-joined key fields>)`` to a dict of
    ``{hotel_cluster (str): accumulated weight}``.

    With ``m`` = ``(year - 2012) * 12 + (month - 12)`` taken from the first
    column of the row:

    * ``best_h00``: key ``user:city:dest:country:market``; requires city,
      distance, user, destination and country to be non-empty; adds
      ``m * m * (3 + 17.60 * is_booking)``.
    * ``best_h01``: key ``user:dest:country:market``; requires city,
      distance, user and destination to be non-empty; same weight.
    * ``best_hotels_miss_odd``: key ``user:city:dest:country:market``;
      requires an EMPTY distance and non-empty city, user, destination and
      country; same weight.
    * ``best_hotels_odd_ulc``: key ``city:distance``; requires both to be
      non-empty; adds ``m`` itself.

    Blank lines are skipped instead of ending the scan, so a stray empty
    line no longer silently drops the remainder of the file.  The file is
    closed even when a malformed row raises.

    Returns:
        tuple: ``(best_h00, best_h01, best_hotels_odd_ulc,
        best_hotels_miss_odd)``.

    Raises:
        IOError / OSError: if ``train.csv`` cannot be opened.
        ValueError, IndexError: if a row has too few columns or non-numeric
            date / ``is_booking`` fields.
    """
    best_hotels_odd_ulc = {}
    best_hotels_miss_odd = {}
    best_h00 = {}
    best_h01 = {}

    def _accumulate(table, key, cluster, weight):
        # Add `weight` to table[key][cluster], creating missing levels.
        per_cluster = table.setdefault(key, {})
        per_cluster[cluster] = per_cluster.get(cluster, 0) + weight

    with open("train.csv", "r") as f:
        next(f, None)  # discard the header row
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                continue
            arr = line.split(",")
            book_year = int(arr[0][:4])
            book_month = int(arr[0][5:7])
            user_location_city = arr[5]
            orig_destination_distance = arr[6]
            user_id = arr[7]
            srch_destination_id = arr[16]
            hotel_country = arr[21]
            hotel_market = arr[22]
            is_booking = float(arr[18])
            hotel_cluster = arr[23]

            # Months relative to December 2012; later rows weigh more.
            relative_ref_month = ((book_year-2012)*12+(book_month-12))
            append_weight = relative_ref_month*relative_ref_month*(3+17.60*is_booking)

            has_city = user_location_city != ''
            has_distance = orig_destination_distance != ''
            has_user_and_dest = user_id != '' and srch_destination_id != ''
            has_country = hotel_country != ''

            user_city_key = ':'.join((user_id, user_location_city,
                                      srch_destination_id, hotel_country,
                                      hotel_market))
            user_key = ':'.join((user_id, srch_destination_id,
                                 hotel_country, hotel_market))

            if has_city and has_distance and has_user_and_dest and has_country:
                _accumulate(best_h00, hash(user_city_key),
                            hotel_cluster, append_weight)

            if has_city and has_distance and has_user_and_dest:
                _accumulate(best_h01, hash(user_key),
                            hotel_cluster, append_weight)

            if has_city and not has_distance and has_user_and_dest and has_country:
                _accumulate(best_hotels_miss_odd, hash(user_city_key),
                            hotel_cluster, append_weight)

            if has_city and has_distance:
                city_distance_key = ':'.join((user_location_city,
                                              orig_destination_distance))
                _accumulate(best_hotels_odd_ulc, hash(city_distance_key),
                            hotel_cluster, relative_ref_month)

    return best_h00, best_h01, best_hotels_odd_ulc, best_hotels_miss_odd

def _write_top_clusters(out, cluster_weights, limit, filled):
    """Append the best not-yet-used clusters of one lookup entry to a row.

    Takes the ``limit`` heaviest clusters of ``cluster_weights`` (ties are
    resolved by ascending cluster string), writes each one that is not in
    ``filled`` as ``' <cluster>'`` and records it in ``filled``.  Stops once
    ``filled`` holds five clusters.  Returns how many clusters were written.
    """
    written = 0
    ranked = nlargest(limit, sorted(cluster_weights.items()), key=itemgetter(1))
    for cluster, _weight in ranked:
        if cluster in filled:
            continue
        if len(filled) == 5:
            break
        out.write(' ' + cluster)
        filled.append(cluster)
        written += 1
    return written


def submit(best_h00, best_h01, best_hotels_odd_ulc, best_hotels_miss_odd):
    """Write ``leakage_deal.csv`` with lookup-based predictions for ``test.csv``.

    For every row of ``test.csv`` (header skipped, blank lines ignored) one
    line ``<id>,`` followed by up to five `` <cluster>`` entries is written.
    Candidates are taken in this order, skipping clusters already written:

    1. top 5 of ``best_hotels_odd_ulc`` for ``city:distance``;
    2. if the distance field is empty, top 4 of ``best_hotels_miss_odd`` for
       ``user:city:dest:country:market``;
    3. top 4 of ``best_h01`` for ``user:dest:country:market``, only when the
       ``user:city:dest:country:market`` key is absent from ``best_h00``.

    Args:
        best_h00, best_h01, best_hotels_odd_ulc, best_hotels_miss_odd:
            dicts mapping ``hash(key string)`` to ``{cluster: weight}``.

    Returns:
        None.  Progress and the number of clusters contributed by each
        source are printed.

    Raises:
        IOError / OSError: if either file cannot be opened.
        IndexError: if a row has fewer columns than expected.
    """
    path = 'leakage_deal.csv'
    count = 0
    count0 = 0
    count00 = 0
    count1 = 0
    # Both handles are released even if a malformed row raises.
    with open(path, "w") as out, open("test.csv", "r") as f:
        next(f, None)  # discard the header row
        out.write("id,hotel_cluster\n")
        for raw_line in f:
            count += 1
            if count % 100000 == 0:
                print('Write {} lines...'.format(count))
            line = raw_line.strip()
            if not line:
                continue
            arr = line.split(",")
            row_id = arr[0]
            user_location_city = arr[6]
            orig_destination_distance = arr[7]
            user_id = arr[8]
            srch_destination_id = arr[17]
            hotel_country = arr[20]
            hotel_market = arr[21]

            out.write(str(row_id) + ',')
            filled = []

            s1 = hash(':'.join((user_location_city, orig_destination_distance)))
            s00 = hash(':'.join((user_id, user_location_city,
                                 srch_destination_id, hotel_country,
                                 hotel_market)))
            s01 = hash(':'.join((user_id, srch_destination_id,
                                 hotel_country, hotel_market)))

            if s1 in best_hotels_odd_ulc:
                count1 += _write_top_clusters(out, best_hotels_odd_ulc[s1], 5, filled)

            # The miss_odd table shares its key layout with best_h00.
            if orig_destination_distance == '' and s00 in best_hotels_miss_odd:
                count0 += _write_top_clusters(out, best_hotels_miss_odd[s00], 4, filled)

            if s01 in best_h01 and s00 not in best_h00:
                count00 += _write_top_clusters(out, best_h01[s01], 4, filled)

            out.write("\n")
    print('count 1=',count1)
    print('count 0=',count0)
    print('count 00=',count00)

# Build the four lookup tables from train.csv, then write the
# lookup-based predictions for test.csv.
leak_tables = leakage_deal()
(best_h00,
 best_h01,
 best_hotels_odd_ulc,
 best_hotels_miss_odd) = leak_tables
submit(best_h00, best_h01, best_hotels_odd_ulc, best_hotels_miss_odd)

2.Random Forest

# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import h5py
from sklearn.ensemble import RandomForestClassifier

def pre_deal(data):
    """Add stay-length and date-part features to ``data`` in place.

    If ``srch_ci`` / ``date_time`` are still strings, values ending in
    ``'00'`` are replaced by ``'2015-12-31'`` and both columns are cast to
    ``datetime64[ns]``.  The cast is best effort: any failure (for example
    the columns already being datetimes, which have no ``.str`` accessor)
    leaves the columns as they are.

    Missing values are then filled with 0 and these columns are added:

    * ``live_in_days``: ``srch_co - srch_ci`` in days (float).
    * ``date_to_live_days``: ``srch_ci - date_time`` in days (float).
    * ``ci_month``, ``ci_day``: month / day of ``srch_ci``.
    * ``date_month``, ``date_day``, ``date_hour``: parts of ``date_time``.

    Finally ``date_time``, ``user_id``, ``srch_ci`` and ``srch_co`` are
    dropped.

    Args:
        data: pandas DataFrame containing at least ``date_time``,
            ``srch_ci``, ``srch_co`` and ``user_id``; modified in place.

    Returns:
        None.
    """
    try:
        data.loc[data.srch_ci.str.endswith('00'),'srch_ci'] = '2015-12-31'
        # An explicit unit is required: newer pandas rejects the unit-less
        # np.datetime64 dtype, and that error used to be swallowed below.
        data['srch_ci'] = data.srch_ci.astype('datetime64[ns]')
        data.loc[data.date_time.str.endswith('00'),'date_time'] = '2015-12-31'
        data['date_time'] = data.date_time.astype('datetime64[ns]')
    except Exception:
        # Deliberate best effort; a bare ``except:`` would also have trapped
        # KeyboardInterrupt / SystemExit.
        pass
    data.fillna(0, inplace=True)

    one_day = np.timedelta64(1,'D')
    # length of the stay, in days
    data['live_in_days'] = data.srch_co-data.srch_ci
    data['live_in_days'] = data['live_in_days'].apply(lambda ts: ts/one_day)
    # lead time between the search event and check-in, in days
    data['date_to_live_days'] = data.srch_ci-data.date_time
    data['date_to_live_days'] = data['date_to_live_days'].apply(lambda ts: ts/one_day)
    data['ci_month'] = data['srch_ci'].apply(lambda dt: dt.month)
    data['ci_day'] = data['srch_ci'].apply(lambda dt: dt.day)
    data['date_month'] = data['date_time'].apply(lambda dt: dt.month)
    data['date_day'] = data['date_time'].apply(lambda dt: dt.day)
    data['date_hour'] = data['date_time'].apply(lambda dt: dt.hour)
    data.drop(['date_time', 'user_id', 'srch_ci', 'srch_co'], axis=1, inplace=True)
import os
# Build (or load from the CSV cache) `agg1`: one row per
# (srch_destination_id, hotel_country, hotel_market) and one column per
# hotel_cluster holding that cluster's normalised share of the group.
if os.path.exists('srch_dest_hc_hm_agg.csv'):
    agg1 = pd.read_csv('srch_dest_hc_hm_agg.csv')
else:
    reader = pd.read_csv('train.csv', parse_dates=['date_time', 'srch_ci', 'srch_co'], chunksize=200000)
    # Per chunk: sum and row count of is_booking for every
    # (destination, country, market, cluster) combination.
    pieces = [chunk.groupby(['srch_destination_id','hotel_country','hotel_market','hotel_cluster'])['is_booking'].agg(['sum','count']) for chunk in reader]
    # Combine the per-chunk partial results into file-wide totals.
    agg = pd.concat(pieces).groupby(level=[0,1,2,3]).sum()
    agg.dropna(inplace=True)
    # Blend the two totals: 0.85 * is_booking sum + 0.15 * row count.
    agg['sum_and_cnt'] = 0.85*agg['sum'] + 0.15*agg['count']
    # Divide every column by its total within each
    # (destination, country, market) group.
    # NOTE(review): the shape of a groupby-apply result differs between
    # pandas versions - confirm the index still has the four expected levels.
    agg = agg.groupby(level=[0,1,2]).apply(lambda x: x.astype(float)/x.sum())
    agg.reset_index(inplace=True)
    # Long -> wide: one column per hotel_cluster value.
    agg1 = agg.pivot_table(index=['srch_destination_id','hotel_country','hotel_market'], columns='hotel_cluster', values='sum_and_cnt').reset_index()
    agg1.to_csv('srch_dest_hc_hm_agg.csv', index=False)
    # Drop the intermediate frames; only agg1 is used afterwards.
    del pieces,agg

# Side tables: destination features to merge in, and the sample submission
# whose row count / column names are reused when writing predictions.
destinations = pd.read_csv('destinations.csv')
submission = pd.read_csv('sample_submission.csv')

# warm_start keeps the trees fitted so far; n_estimators is raised by one
# before every fit, so each accepted chunk adds to the same forest.
# NOTE(review): the counter starts at 100, so the first accepted chunk is
# asked for 101 trees - confirm this is intended.
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, warm_start=True)
chunksize = 200000
count = 0
train_reader = pd.read_csv(
    'train.csv',
    parse_dates=['date_time', 'srch_ci', 'srch_co'],
    chunksize=chunksize,
)
for batch in train_reader:
    try:
        # Train on rows with is_booking == 1 only.
        booked = batch[batch.is_booking == 1]
        booked = pd.merge(booked, destinations, how='left', on='srch_destination_id')
        booked = pd.merge(booked, agg1, how='left',
                          on=['srch_destination_id', 'hotel_country', 'hotel_market'])
        pre_deal(booked)
        target = booked.hotel_cluster
        booked.drop(['cnt', 'hotel_cluster', 'is_booking'], axis=1, inplace=True)
        # Fit only on chunks in which exactly 100 distinct clusters occur.
        if len(target.unique()) == 100:
            clf.set_params(n_estimators=clf.n_estimators + 1)
            clf.fit(booked, target)
        count += chunksize
        print(count,' have done')
        # Stop after 300 chunks.
        if count/chunksize == 300:
            break
    except Exception as e:
        # A failing chunk is reported and skipped.
        print(str(e))

# Predict class probabilities for test.csv chunk by chunk, writing each
# chunk's result into its row range of a preallocated matrix.
chunksize = 10000
count = 0
preds = np.empty((submission.shape[0], clf.n_classes_))
test_reader = pd.read_csv(
    'test.csv',
    parse_dates=['date_time', 'srch_ci', 'srch_co'],
    chunksize=chunksize,
)
for batch in test_reader:
    features = pd.merge(batch, destinations, how='left', on='srch_destination_id')
    features = pd.merge(features, agg1, how='left',
                        on=['srch_destination_id', 'hotel_country', 'hotel_market'])
    features.drop(['id'], axis=1, inplace=True)
    pre_deal(features)
    stop = count + features.shape[0]
    preds[count:stop, :] = clf.predict_proba(features)
    count += chunksize
    print(count,' have done')
# The model and the aggregate table are not needed past this point.
del clf, agg1

# Persist the probability matrix in rf.h5 (dataset 'preds_latest').  When
# the file already exists, the stored matrix is added to this run's
# probabilities and the sum is written back, so repeated runs accumulate.
if os.path.exists('rf.h5'):
    with h5py.File('rf.h5', 'r+') as hf:
        predslatesthf = hf['preds_latest']
        # `Dataset.value` no longer exists in h5py 3; `[()]` reads the whole
        # dataset on old and new versions alike.
        preds += predslatesthf[()]
        predslatesthf[...] = preds
else:
    with h5py.File('rf.h5', 'w') as hf:
        hf.create_dataset('preds_latest', data=preds)

# Column indices of the five largest values per row, best first.
# NOTE(review): assumes column index == hotel_cluster label - confirm.
fea_ind = np.argsort(-preds, axis=1)[:,:5]
happend = [' '.join(row.astype(str)) for row in fea_ind]
submit = pd.DataFrame(data=happend, index=submission.id)
submit.reset_index(inplace=True)
submit.columns = submission.columns
submit.to_csv('rf_deal.csv', index=False)

3.SGDLR

# -*- coding: utf-8 -*-
import pandas as pd
from scipy.sparse import csr_matrix, hstack
import numpy as np
import h5py
import pickle
from sklearn.linear_model import SGDClassifier
import os
# Columns that pre_process() prefixes with their own name and replaces by a
# hash value.
cat_col = ['user_id','user_location_city','srch_destination_id','srch_destination_type_id',
           'hotel_continent','hotel_country', 'hotel_market']
# Columns copied into the feature matrix unchanged.
num_col = ['is_mobile', 'is_package']
def bin_time(t):
    """Map a number ``t`` to one of five buckets.

    Returns 0 for ``t < 0``, 1 for ``t < 2``, 2 for ``t < 7``, 3 for
    ``t < 30`` and 4 for everything else (including NaN, which compares
    false against every bound).
    """
    for bucket, upper_bound in enumerate((0, 2, 7, 30)):
        if t < upper_bound:
            return bucket
    return 4

def pre_process(data):
    """Turn raw search rows into hashed categorical features, in place.

    If ``srch_ci`` / ``date_time`` are still strings, values ending in
    ``'00'`` are replaced by ``'2015-12-31'`` and both columns are cast to
    ``datetime64[ns]``; the cast is best effort and any failure leaves the
    columns as they are.  Missing values are then filled with 0 and the
    following columns are written:

    * ``ci_month``: month of ``srch_ci``.
    * ``season_dest``: hash of ``'season_dest<ci_month>*<srch_destination_id>'``.
    * ``date_to_live_days``: ``srch_ci - date_time`` in days, bucketed by
      ``bin_time``.
    * ``time_dest``: hash of ``'time_dest<bucket>*<srch_destination_id>'``.
    * every column in ``cat_col``: hash of ``'<column name><value>'``.

    Hashes are CRC-32 values of the UTF-8 text.  The builtin ``hash()`` of a
    ``str`` is salted per interpreter process, so features built with it in
    one run would not line up with a classifier pickled by an earlier run.
    A classifier trained on the old builtin-hash features must be retrained.

    Args:
        data: pandas DataFrame with ``srch_ci``, ``date_time``,
            ``srch_destination_id`` and all ``cat_col`` columns; modified in
            place.

    Returns:
        None.
    """
    from zlib import crc32

    def stable_hash(text):
        # Same value in every process and on every platform.
        return crc32(text.encode('utf-8')) & 0xffffffff

    try:
        data.loc[data.srch_ci.str.endswith('00'),'srch_ci'] = '2015-12-31'
        # An explicit unit is required: newer pandas rejects the unit-less
        # np.datetime64 dtype, and that error used to be swallowed below.
        data['srch_ci'] = data.srch_ci.astype('datetime64[ns]')
        data.loc[data.date_time.str.endswith('00'),'date_time'] = '2015-12-31'
        data['date_time'] = data.date_time.astype('datetime64[ns]')
    except Exception:
        # Deliberate best effort (e.g. the columns are already datetimes and
        # have no ``.str`` accessor); a bare ``except:`` would also have
        # trapped KeyboardInterrupt / SystemExit.
        pass
    data.fillna(0, inplace=True)
    data['ci_month'] = data['srch_ci'].apply(lambda dt: dt.month)
    data['season_dest'] = 'season_dest' + data.ci_month.map(str) + '*' + data.srch_destination_id.map(str)
    data['season_dest'] = data['season_dest'].map(stable_hash)
    data['date_to_live_days'] = data.srch_ci-data.date_time
    data['date_to_live_days'] = data['date_to_live_days'].apply(lambda td: td/np.timedelta64(1, 'D'))
    data['date_to_live_days'] = data['date_to_live_days'].map(bin_time)
    data['time_dest'] = 'time_dest' + data.date_to_live_days.map(str) + '*' + data.srch_destination_id.map(str)
    data['time_dest'] = data['time_dest'].map(stable_hash)

    for col in cat_col:
        # Prefixing with the column name keeps equal raw values from
        # different columns apart.
        data[col] = col + data[col].map(str)
        data[col] = data[col].map(stable_hash)

# Sample submission: its `id` column and column names are reused when the
# prediction file is written.
submission = pd.read_csv('sample_submission.csv')
# Every hashed column fed into the sparse matrix: the base categorical
# columns plus the two interaction columns created by pre_process().
cat_col_all = cat_col + ['season_dest', 'time_dest']
def map5eval(preds, actual):
    """Return the mean average precision at 5 for single-label rows.

    Args:
        preds: 2-D array of scores, one row per sample and one column per
            class; needs at least five columns.
        actual: 1-D array-like with the true class column index per sample.

    Returns:
        float: mean over samples of ``1 / rank`` when the true class is
        among the five highest-scored columns (rank 1 = highest score), and
        0 otherwise.
    """
    # Take the LAST five columns of the ascending argsort, best first.
    # ``-np.arange(5)`` was [0, -1, -2, -3, -4] and therefore ranked the
    # least likely class first.
    predicted = preds.argsort(axis=1)[:, -np.arange(1, 6)]
    metric = 0.
    for i in range(5):
        # float divisor: keeps true division even for integer counts
        metric += np.sum(actual == predicted[:, i]) / (i + 1.0)
    metric /= actual.shape[0]
    return metric

# Resume from a previously pickled classifier when sgd.pkl exists,
# otherwise start with a fresh one.
# NOTE(review): pickle.load can execute arbitrary code - only load a
# sgd.pkl that you produced yourself.
if os.path.exists('sgd.pkl'):
    with open('sgd.pkl', 'rb') as f:
        clf = pickle.load(f)
else:
    # NOTE(review): newer scikit-learn releases may expect loss='log_loss'
    # instead of 'log' - confirm against the installed version.
    clf = SGDClassifier(loss='log', alpha=0.0000025, verbose=0)
#clf.sparsify()
# Five passes over train.csv; each chunk is fed to partial_fit.
for epoch in range(5):
    # Rows consumed so far in this pass; used below as the offset that turns
    # chunk index labels into positions inside the chunk.
    count = 0
    chunksize = 200000
    # Width of the hashed feature space (also read by the prediction code
    # that follows this loop).
    n_features = 3000000
    print('Epoch: ', epoch)
    reader = pd.read_csv('train.csv', parse_dates=['date_time', 'srch_ci', 'srch_co'], chunksize=chunksize)
    for chunk in reader:
        try:
            pre_process(chunk)
            y = chunk.hotel_cluster
            # Sample weight: 1 + 4 * is_booking.
            sw = 1 + 4*chunk.is_booking
            chunk.drop(['cnt', 'hotel_cluster', 'is_booking'], axis=1, inplace=True)
            XN = csr_matrix(chunk[num_col].values)
            X = csr_matrix((chunk.shape[0], n_features))
            rows = np.arange(chunk.shape[0])
            # Every hashed column adds a 1 at column (hash % n_features);
            # the per-column matrices are summed, so collisions add up.
            for col in cat_col_all:
                dat = np.ones(chunk.shape[0])
                cols = chunk[col] % n_features
                X += csr_matrix((dat, (rows, cols)), shape=(chunk.shape[0], n_features))
            # Numeric columns first, hashed columns after them.
            X = hstack((XN, X))
            # Index labels of the rows whose weight is above 1.
            book_indices = sw[sw > 1].index.tolist()
            # assumes the chunk's index labels start at `count` - TODO confirm
            x_indices=[(x-count) for x in book_indices]
            X_test = csr_matrix(X)[x_indices]
            y_test = y[book_indices]
            clf.partial_fit(X, y, classes=np.arange(100), sample_weight=sw)         
            count = count + chunksize
            # The score is computed on rows the model was just fitted on.
            map5 = map5eval(clf.predict_proba(X_test), y_test)
            print((count, map5),' have done')
            # Stop this pass after 200 chunks.
            if(count/chunksize == 200):
                break
        except Exception as e:
            # A failing chunk is reported and skipped; the offset is still
            # advanced so later chunks keep the right positions.
            count = count + chunksize
            print(str(e))
            pass

# Save the trained classifier so that a later run can resume from it.
with open('sgd.pkl', 'wb') as f:
    pickle.dump(clf, f)

# Predict class probabilities for test.csv chunk by chunk.  The per-chunk
# results are collected in a list and stacked once after the loop; stacking
# inside the loop recopied the whole growing matrix on every chunk.
count = 0
chunksize = 10000
pred_chunks = []
reader = pd.read_csv('test.csv', parse_dates=['date_time', 'srch_ci', 'srch_co'], chunksize=chunksize)
for chunk in reader:
    chunk.drop(['id'], axis=1, inplace=True)
    pre_process(chunk)
    XN = csr_matrix(chunk[num_col].values)
    X = csr_matrix((chunk.shape[0], n_features))
    rows = np.arange(chunk.shape[0])
    # Every hashed column adds a 1 at column (hash % n_features).
    for col in cat_col_all:
        dat = np.ones(chunk.shape[0])
        cols = chunk[col] % n_features
        X += csr_matrix((dat, (rows, cols)), shape=(chunk.shape[0], n_features))
    # Numeric columns first, hashed columns after them.
    X = hstack((XN, X))
    pred_chunks.append(clf.predict_proba(X))
    count = count + chunksize
    print(count,' have done')
# The empty (0, 100) seed keeps the result shape and the 100-column check
# identical to the incremental version, also when test.csv has no rows.
preds = np.vstack([np.empty((0,100))] + pred_chunks)
del clf, pred_chunks

# Store the probability matrix in sgd.h5 (dataset 'preds'): create the file
# on the first run, overwrite the dataset contents on later runs.
if not os.path.exists('sgd.h5'):
    with h5py.File('sgd.h5', 'w') as hf:
        hf.create_dataset('preds', data=preds)
else:
    with h5py.File('sgd.h5', 'r+') as hf:
        hf['preds'][...] = preds

# Column indices of the five largest values per row, best first, written as
# one space-separated string per test row.
top5 = np.argsort(-preds, axis=1)[:, :5]
hc = [' '.join(labels.astype(str)) for labels in top5]
submit = pd.DataFrame(data=hc, index=submission.id).reset_index()
submit.columns = submission.columns
submit.to_csv('sgdlr_deal.csv', index=False)

4.blend

# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from sklearn.preprocessing import normalize
import h5py

submission = pd.read_csv('sample_submission.csv')

# Blend = 0.54 * RF + 0.46 * SGD, after scaling every row of each matrix to
# unit L1 norm.

# read in RF results
with h5py.File('rf.h5', 'r') as hf:
    # `Dataset.value` no longer exists in h5py 3; `[()]` reads the whole
    # dataset on old and new versions alike.
    preds = 0.54*normalize(hf['preds_latest'][()], norm='l1', axis=1)


# read in SGD results
# The SGD script in this file stores its matrix in 'sgd.h5' (dataset
# 'preds'); the path read here previously pointed to a file that no script
# in this file writes.
with h5py.File('sgd.h5', 'r') as hf:
    preds += 0.46*normalize(hf['preds'][()], norm='l1', axis=1)


# Column indices of the five largest blended values per row, best first.
col_ind = np.argsort(-preds, axis=1)[:,:5]
hc = [' '.join(row.astype(str)) for row in col_ind]

sub = pd.DataFrame(data=hc, index=submission.id)
sub.reset_index(inplace=True)
sub.columns = submission.columns
sub.to_csv('blend_deal.csv', index=False)

5.stack

# -*- coding: utf-8 -*-
import pandas as pd

# Lookup-based predictions: rows without any match come back as NaN and
# are turned into '' so that they split into [''].
leak_frame = pd.read_csv('leakage_deal.csv')
leak_frame.fillna('', inplace=True)
match_pred = [cell.split(' ') for cell in leak_frame['hotel_cluster'].tolist()]

# Blended model predictions, plus the ids used for the final file.
blend_frame = pd.read_csv('blend_deal.csv')
ids = blend_frame.id
pred_sub = [cell.split(' ') for cell in blend_frame['hotel_cluster'].tolist()]

def f0(seq, idfun=None):
    """Return the items of ``seq`` without duplicates, keeping first-seen order.

    ``idfun`` maps an item to the marker used for the duplicate test (the
    item itself when omitted).  Items whose marker equals ``''`` are
    dropped as well.
    """
    marker_of = idfun if idfun is not None else (lambda value: value)
    emitted = set()
    kept = []
    for element in seq:
        marker = marker_of(element)
        if not (marker in emitted or marker == ''):
            emitted.add(marker)
            kept.append(element)
    return kept
    
# Lookup matches come first, the blended predictions fill the remaining
# slots; keep the first five distinct clusters per row.
full_preds = [f0(match_pred[idx] + blend_row)[:5]
              for idx, blend_row in enumerate(pred_sub)]

write_p = [" ".join(str(cluster) for cluster in row) for row in full_preds]
write_frame = ["id,hotel_cluster"]
write_frame.extend("{0},{1}".format(ids[idx], row_text)
                   for idx, row_text in enumerate(write_p))
with open("final_predictions.csv", "w+") as f:
    f.write("\n".join(write_frame))

结果:


image.png

image.png
最后编辑于
©著作权归作者所有,转载或内容合作请联系作者
  • 序言:七十年代末,一起剥皮案震惊了整个滨河市,随后出现的几起案子,更是在滨河造成了极大的恐慌,老刑警刘岩,带你破解...
    沈念sama阅读 204,921评论 6 478
  • 序言:滨河连续发生了三起死亡事件,死亡现场离奇诡异,居然都是意外死亡,警方通过查阅死者的电脑和手机,发现死者居然都...
    沈念sama阅读 87,635评论 2 381
  • 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
    开封第一讲书人阅读 151,393评论 0 338
  • 文/不坏的土叔 我叫张陵,是天一观的道长。 经常有香客问我,道长,这世上最难降的妖魔是什么? 我笑而不...
    开封第一讲书人阅读 54,836评论 1 277
  • 正文 为了忘掉前任,我火速办了婚礼,结果婚礼上,老公的妹妹穿的比我还像新娘。我一直安慰自己,他们只是感情好,可当我...
    茶点故事阅读 63,833评论 5 368
  • 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
    开封第一讲书人阅读 48,685评论 1 281
  • 那天,我揣着相机与录音,去河边找鬼。 笑死,一个胖子当着我的面吹牛,可吹牛的内容都是我干的。 我是一名探鬼主播,决...
    沈念sama阅读 38,043评论 3 399
  • 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...
    开封第一讲书人阅读 36,694评论 0 258
  • 序言:老挝万荣一对情侣失踪,失踪者是张志新(化名)和其女友刘颖,没想到半个月后,有当地人在树林里发现了一具尸体,经...
    沈念sama阅读 42,671评论 1 300
  • 正文 独居荒郊野岭守林人离奇死亡,尸身上长有42处带血的脓包…… 初始之章·张勋 以下内容为张勋视角 年9月15日...
    茶点故事阅读 35,670评论 2 321
  • 正文 我和宋清朗相恋三年,在试婚纱的时候发现自己被绿了。 大学时的朋友给我发了我未婚夫和他白月光在一起吃饭的照片。...
    茶点故事阅读 37,779评论 1 332
  • 序言:一个原本活蹦乱跳的男人离奇死亡,死状恐怖,灵堂内的尸体忽然破棺而出,到底是诈尸还是另有隐情,我是刑警宁泽,带...
    沈念sama阅读 33,424评论 4 321
  • 正文 年R本政府宣布,位于F岛的核电站,受9级特大地震影响,放射性物质发生泄漏。R本人自食恶果不足惜,却给世界环境...
    茶点故事阅读 39,027评论 3 307
  • 文/蒙蒙 一、第九天 我趴在偏房一处隐蔽的房顶上张望。 院中可真热闹,春花似锦、人声如沸。这庄子的主人今日做“春日...
    开封第一讲书人阅读 29,984评论 0 19
  • 文/苍兰香墨 我抬头看了看天上的太阳。三九已至,却和暖如春,着一层夹袄步出监牢的瞬间,已是汗流浃背。 一阵脚步声响...
    开封第一讲书人阅读 31,214评论 1 260
  • 我被黑心中介骗来泰国打工, 没想到刚下飞机就差点儿被人妖公主榨干…… 1. 我叫王不留,地道东北人。 一个月前我还...
    沈念sama阅读 45,108评论 2 351
  • 正文 我出身青楼,却偏偏与公主长得像,于是被迫代替她去往敌国和亲。 传闻我的和亲对象是个残疾皇子,可洞房花烛夜当晚...
    茶点故事阅读 42,517评论 2 343

推荐阅读更多精彩内容

  • [TOC] About Trs 只是阅读过程中对其中一些进行注脚而已,更确切的内容还是英文原文来的清晰,有些翻译反...
    mrlevo520阅读 1,180评论 0 0
  • by:孤鸟差鱼 时光不懂人情 我也害你哭的老惨 不过我没有时光坏 手里的那把刀总是钝了些许 还懂点人情味
    孤鸟差鱼阅读 118评论 0 4
  • 今晚路灯没亮 大概是你飞远了 以前你住在那圆圆的灯里 圆圆的像你的两只白兔子 我站在阳台上看你 月亮熄着灯也熄着 你亮着
    中习习阅读 118评论 3 6
  • 人与人之间真美好,我觉得我还是挺幸运的。有些沟通真的不用语言的,就好像不看着他,也能心连接~静静地体会,多美好~
    七月不繁花阅读 226评论 0 0
  • 茶,国之佳饮;礼之尚品。初出川地,古属巴蜀,后种植各地,为祭、食、饮、药之用,素为士人所钟爱。茶有五名,一曰茶,二...
    清華來也阅读 456评论 0 0