2020-07-17 暑期学习日更计划 (李宏毅2020-hw2 pytorch实现)

ML2020spring - hw2

该作业kaggle地址:ML2020spring - hw2 Classification - Binary Income Prediction
一个二项分类问题:由一系列给出的特征数据,判断此人的年收入是否超过 50,000 美元(50K)。
数据处理部分沿用了numpy的数据处理方式,神经网络的实现部分用了pytorch

数据预处理部分:

import os
import csv
import torch
from torch import nn
import numpy as np
import matplotlib.pyplot as plt

# Paths to the Kaggle hw2 data files and the output filename template.
X_train_fpath = './lhy_DL_Hw/hw2_data/X_train'
Y_train_fpath = './lhy_DL_Hw/hw2_data/Y_train'
X_test_fpath = './lhy_DL_Hw/hw2_data/X_test'
output_fpath = './lhy_DL_Hw/output_{}.csv'

# Each CSV has a header row (skipped via next(f)) and an id in column 0
# (dropped via [1:]); the remaining fields are parsed as floats.
with open(X_train_fpath) as f:
    next(f)
    X_train = np.array([line.strip('\n').split(',')[1:] for line in f], dtype = float)
# Y_train keeps only column 1: the 0/1 income label.
with open(Y_train_fpath) as f:
    next(f)
    Y_train = np.array([line.strip('\n').split(',')[1] for line in f], dtype = float)
with open(X_test_fpath) as f:
    next(f)
    X_test = np.array([line.strip('\n').split(',')[1:] for line in f], dtype = float)

def _train_dev_split(X, Y, dev_ratio = 0.25):
    # This function spilts data into training set and development set.
    train_size = int(len(X) * (1 - dev_ratio))
    return X[:train_size], Y[:train_size], X[train_size:], Y[train_size:]


def _normalize(X, train=True, specified_column=None, X_mean=None, X_std=None):

    if specified_column == None:
        specified_column = np.arange(X.shape[1])
    if train:
        X_mean = np.mean(X[:, specified_column], 0).reshape(1, -1)
        X_std = np.std(X[:, specified_column], 0).reshape(1, -1)

    X[:, specified_column] = (X[:, specified_column] - X_mean) / (X_std + 1e-8)

    return X, X_mean, X_std

# Hold out 10% of the training data as a development (validation) set.
dev_ratio = 0.1
X_train, Y_train, X_dev, Y_dev = _train_dev_split(X_train, Y_train, dev_ratio = dev_ratio)
# Compute normalization statistics on the training split only...
X_train, X_mean, X_std =_normalize(X_train)

# ...and reuse them for the dev and test sets to avoid information leakage.
X_dev,_,_=_normalize(X_dev,X_mean=X_mean,X_std=X_std)
X_test,_,_=_normalize(X_test,X_mean=X_mean,X_std=X_std)

构造数据集,并搭建神经网络:

# Wrap the numpy arrays as torch tensors and TensorDatasets.
X_train = torch.from_numpy(X_train)
Y_train = torch.from_numpy(Y_train)
X_dev = torch.from_numpy(X_dev)
Y_dev = torch.from_numpy(Y_dev)

X_test = torch.from_numpy(X_test)

train_dataset = torch.utils.data.TensorDataset(X_train, Y_train)
dev_dataset = torch.utils.data.TensorDataset(X_dev, Y_dev)
test_dataset = torch.utils.data.TensorDataset(X_test)

train_loader = torch.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=256)
# BUG FIX: shuffling evaluation data is pointless for the dev set and harmful
# for the test set (submission rows must stay aligned with sample ids), so
# shuffle is disabled for both.
dev_loader = torch.utils.data.DataLoader(dev_dataset, shuffle=False, batch_size=256)
test_loader = torch.utils.data.DataLoader(test_dataset, shuffle=False, batch_size=256)

print(X_train.shape, len(X_train[0]), Y_train.shape)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
input_size = 510       # number of feature columns in X_train
output_size = 2        # binary classification -> two logits for CrossEntropyLoss
learning_rate = 0.0001
epoches = 15
PATH = ".\\hw2_classification_save\\classification_best_parameter.pkl"

# Two hidden layers with dropout for regularization.
# BUG FIX: the original network ended with an extra ReLU after the output
# Linear layer, which clamps logits to be non-negative and cripples
# CrossEntropyLoss (it expects unrestricted raw logits); that final
# activation is removed.
model = torch.nn.Sequential(
    torch.nn.Linear(input_size, 512),
    torch.nn.ReLU(),
    torch.nn.Linear(512, 72),
    torch.nn.Dropout(0.5),
    torch.nn.ReLU(),
    torch.nn.Linear(72, output_size),
)
model.to(device)
# CrossEntropyLoss applies log-softmax internally, so the model emits raw logits.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

开始训练:

training_loss_list = []
dev_loss_list = []
acc_list = []
dev_acc_list = []
max_acc = 0
if os.path.exists(PATH):
    # A checkpoint already exists: load it instead of retraining.
    m = model.load_state_dict(torch.load(PATH))
    print(m)
else:
    print("Training Start")
    for epoch in range(epoches):
        # BUG FIX: switch to training mode so Dropout is active only while
        # training (the original never toggled train/eval modes).
        model.train()
        running_loss = 0
        correct_total = 0
        labels_total = 0
        for i, data in enumerate(train_loader):
            inputs, labels = data
            # BUG FIX: the original used torch.tensor(inputs, ...), which
            # copies the tensor and emits a warning; .float() is the
            # idiomatic in-place-free cast.
            inputs = inputs.float().to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels.long())
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

            # Predicted class = argmax over the two logits.
            _, predict = torch.max(outputs, 1)
            correct_total += (predict == labels).sum().item()
            labels_total += len(labels)

        acc = correct_total / labels_total
        acc_list.append(acc)
        training_loss_list.append(running_loss / labels_total)
        if epoch % 1 == 0:
            print("epoch", epoch, "loss={:.5}".format(running_loss / labels_total), "acc={:.5}".format(acc))
            dev_loss = 0
            dev_acc = 0
            dev_correct_total = 0
            dev_labels_total = 0
            # Validate on the dev set after each epoch.
            # BUG FIX: eval mode disables Dropout for a fair validation score.
            model.eval()
            with torch.no_grad():
                for data in dev_loader:
                    dev_inputs, dev_labels = data
                    dev_inputs = dev_inputs.to(device)
                    dev_labels = dev_labels.to(device)

                    dev_outputs = model(dev_inputs.float())
                    loss = criterion(dev_outputs, dev_labels.long())
                    dev_loss += loss.item()
                    _, dev_predict = torch.max(dev_outputs, 1)
                    dev_correct_total += (dev_predict == dev_labels).sum().item()
                    dev_labels_total += len(dev_labels)

                    dev_acc = dev_correct_total / dev_labels_total
                dev_loss_list.append(dev_loss / dev_labels_total)
                dev_acc_list.append(dev_acc)
                print("[dev_loss]={:.5}".format(dev_loss / dev_labels_total), "[dev_acc]={:.5}".format(dev_acc))

            # Keep the checkpoint with the best dev accuracy seen so far.
            if dev_acc > max_acc:
                max_acc = dev_acc
                torch.save(model.state_dict(), PATH)
                print("model saved,max_acc=", max_acc)

    # Plot training vs. dev loss curves.
    plt.plot(np.arange(epoches), training_loss_list)
    plt.plot(np.arange(epoches), dev_loss_list)
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.show()
    # Plot training vs. dev accuracy curves.
    plt.plot(np.arange(epoches), acc_list)
    plt.plot(np.arange(epoches), dev_acc_list)
    plt.xlabel("Epoch")
    plt.ylabel("Acc")
    plt.show()
    print("Finished Training")

输入测试集并生成要提交csv文件

# Predict labels for the test set and write the Kaggle submission file.
X_test = X_test.to(device)
# BUG FIX: disable dropout and gradient tracking at inference time
# (the original ran the model in training mode with autograd enabled).
model.eval()
with torch.no_grad():
    test_predict = model(X_test.float())
_, test_predict = torch.max(test_predict, 1)
print("len=", len(test_predict), test_predict)

with open('classification_submit.csv', mode='w', newline='') as submit_file:
    csv_writer = csv.writer(submit_file)
    csv_writer.writerow(['id', 'label'])
    # One row per test sample: sequential id + predicted class (0/1).
    for i in range(len(test_predict)):
        csv_writer.writerow([str(i), test_predict[i].item()])
        # print(row)


自此,该作业全部完成。
在kaggle上与别人比较了最后的得分,并多次修改了网络模型结构,然而该模型的表现仍旧不理想,在参考了别人的作业后发现,在完成了训练集训练和验证集的验证后,还可以把验证集作为训练集,通过增加数据集的形式优化模型参数。

代码如下:

import os
import csv
import torch
from torch import nn
import numpy as np

# Paths to the Kaggle hw2 data files and the output filename template.
X_train_fpath = './lhy_DL_Hw/hw2_data/X_train'
Y_train_fpath = './lhy_DL_Hw/hw2_data/Y_train'
X_test_fpath = './lhy_DL_Hw/hw2_data/X_test'
output_fpath = './lhy_DL_Hw/output_{}.csv'

# Each CSV has a header row (skipped via next(f)) and an id in column 0
# (dropped via [1:]); the remaining fields are parsed as floats.
with open(X_train_fpath) as f:
    next(f)
    X_train = np.array([line.strip('\n').split(',')[1:] for line in f], dtype = float)
# Y_train keeps only column 1: the 0/1 income label.
with open(Y_train_fpath) as f:
    next(f)
    Y_train = np.array([line.strip('\n').split(',')[1] for line in f], dtype = float)
with open(X_test_fpath) as f:
    next(f)
    X_test = np.array([line.strip('\n').split(',')[1:] for line in f], dtype = float)

def _normalize(X, train=True, specified_column=None, X_mean=None, X_std=None):

    if specified_column == None:
        specified_column = np.arange(X.shape[1])
    if train:
        X_mean = np.mean(X[:, specified_column], 0).reshape(1, -1)
        X_std = np.std(X[:, specified_column], 0).reshape(1, -1)

    X[:, specified_column] = (X[:, specified_column] - X_mean) / (X_std + 1e-8)

    return X, X_mean, X_std

# Normalize with training-set statistics; reuse them for the test set.
X_train, X_mean, X_std = _normalize(X_train)

X_test, _, _ = _normalize(X_test, X_mean=X_mean, X_std=X_std)

# No train/dev split this time: hyper-parameters were already tuned on the
# previous run, so the whole training set (dev portion included) is used.
X_train = torch.from_numpy(X_train)
Y_train = torch.from_numpy(Y_train)

print(X_train.shape)

X_test = torch.from_numpy(X_test)

train_dataset = torch.utils.data.TensorDataset(X_train, Y_train)
test_dataset = torch.utils.data.TensorDataset(X_test)

train_loader = torch.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=512)
# BUG FIX: never shuffle the test loader -- predictions must stay in file
# order so that row i of the submission matches test sample i.
test_loader = torch.utils.data.DataLoader(test_dataset, shuffle=False, batch_size=512)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
input_size = 510       # number of feature columns in X_train
output_size = 2        # binary classification -> two logits for CrossEntropyLoss
learning_rate = 0.0001
epoches = 15
PATH = ".\\hw2_classification_save\\classification_parameter_update.pkl"

# Same architecture as the first run: two hidden layers plus dropout.
# BUG FIX: the original network ended with an extra ReLU after the output
# Linear layer, which clamps logits to be non-negative and cripples
# CrossEntropyLoss (it expects unrestricted raw logits); that final
# activation is removed.
model = torch.nn.Sequential(
    torch.nn.Linear(input_size, 512),
    torch.nn.ReLU(),
    torch.nn.Linear(512, 72),
    torch.nn.Dropout(0.5),
    torch.nn.ReLU(),
    torch.nn.Linear(72, output_size),
)

model.to(device)
# CrossEntropyLoss applies log-softmax internally, so the model emits raw logits.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

training_loss_list = []
acc_list = []

if os.path.exists(PATH):
    # Reuse the saved weights instead of retraining.
    model.load_state_dict(torch.load(PATH))
else:
    for epoch in range(epoches):
        # BUG FIX: make sure Dropout is active during training (the original
        # never set the module's train/eval mode explicitly).
        model.train()
        running_loss = 0
        correct_total = 0
        labels_total = 0
        for i, data in enumerate(train_loader):
            inputs, labels = data
            # BUG FIX: torch.tensor(inputs, ...) copies the tensor and emits
            # a warning; .float() is the idiomatic cast.
            inputs = inputs.float().to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels.long())
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

            # Predicted class = argmax over the two logits.
            _, predict = torch.max(outputs, 1)
            correct_total += (predict == labels).sum().item()
            labels_total += len(labels)

        acc = correct_total / labels_total
        acc_list.append(acc)
        training_loss_list.append(running_loss / labels_total)
        if epoch % 1 == 0:
            print("epoch", epoch, "loss={:.5}".format(running_loss / labels_total), "acc={:.5}".format(acc))
    # No dev set this time, so simply save the final weights.
    torch.save(model.state_dict(), PATH)


# Predict labels for the test set and write the Kaggle submission file.
X_test = X_test.to(device)
# BUG FIX: disable dropout and gradient tracking at inference time
# (the original ran the model in training mode with autograd enabled).
model.eval()
with torch.no_grad():
    test_predict = model(X_test.float())
_, test_predict = torch.max(test_predict, 1)
print("len=", len(test_predict), test_predict)

with open('classification_submit_update.csv', mode='w', newline='') as submit_file:
    csv_writer = csv.writer(submit_file)
    csv_writer.writerow(['id', 'label'])
    # One row per test sample: sequential id + predicted class (0/1).
    for i in range(len(test_predict)):
        csv_writer.writerow([str(i), test_predict[i].item()])

二项分类模型实现!

©著作权归作者所有,转载或内容合作请联系作者
  • 序言:七十年代末,一起剥皮案震惊了整个滨河市,随后出现的几起案子,更是在滨河造成了极大的恐慌,老刑警刘岩,带你破解...
    沈念sama阅读 216,142评论 6 498
  • 序言:滨河连续发生了三起死亡事件,死亡现场离奇诡异,居然都是意外死亡,警方通过查阅死者的电脑和手机,发现死者居然都...
    沈念sama阅读 92,298评论 3 392
  • 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
    开封第一讲书人阅读 162,068评论 0 351
  • 文/不坏的土叔 我叫张陵,是天一观的道长。 经常有香客问我,道长,这世上最难降的妖魔是什么? 我笑而不...
    开封第一讲书人阅读 58,081评论 1 291
  • 正文 为了忘掉前任,我火速办了婚礼,结果婚礼上,老公的妹妹穿的比我还像新娘。我一直安慰自己,他们只是感情好,可当我...
    茶点故事阅读 67,099评论 6 388
  • 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
    开封第一讲书人阅读 51,071评论 1 295
  • 那天,我揣着相机与录音,去河边找鬼。 笑死,一个胖子当着我的面吹牛,可吹牛的内容都是我干的。 我是一名探鬼主播,决...
    沈念sama阅读 39,990评论 3 417
  • 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...
    开封第一讲书人阅读 38,832评论 0 273
  • 序言:老挝万荣一对情侣失踪,失踪者是张志新(化名)和其女友刘颖,没想到半个月后,有当地人在树林里发现了一具尸体,经...
    沈念sama阅读 45,274评论 1 310
  • 正文 独居荒郊野岭守林人离奇死亡,尸身上长有42处带血的脓包…… 初始之章·张勋 以下内容为张勋视角 年9月15日...
    茶点故事阅读 37,488评论 2 331
  • 正文 我和宋清朗相恋三年,在试婚纱的时候发现自己被绿了。 大学时的朋友给我发了我未婚夫和他白月光在一起吃饭的照片。...
    茶点故事阅读 39,649评论 1 347
  • 序言:一个原本活蹦乱跳的男人离奇死亡,死状恐怖,灵堂内的尸体忽然破棺而出,到底是诈尸还是另有隐情,我是刑警宁泽,带...
    沈念sama阅读 35,378评论 5 343
  • 正文 年R本政府宣布,位于F岛的核电站,受9级特大地震影响,放射性物质发生泄漏。R本人自食恶果不足惜,却给世界环境...
    茶点故事阅读 40,979评论 3 325
  • 文/蒙蒙 一、第九天 我趴在偏房一处隐蔽的房顶上张望。 院中可真热闹,春花似锦、人声如沸。这庄子的主人今日做“春日...
    开封第一讲书人阅读 31,625评论 0 21
  • 文/苍兰香墨 我抬头看了看天上的太阳。三九已至,却和暖如春,着一层夹袄步出监牢的瞬间,已是汗流浃背。 一阵脚步声响...
    开封第一讲书人阅读 32,796评论 1 268
  • 我被黑心中介骗来泰国打工, 没想到刚下飞机就差点儿被人妖公主榨干…… 1. 我叫王不留,地道东北人。 一个月前我还...
    沈念sama阅读 47,643评论 2 368
  • 正文 我出身青楼,却偏偏与公主长得像,于是被迫代替她去往敌国和亲。 传闻我的和亲对象是个残疾皇子,可洞房花烛夜当晚...
    茶点故事阅读 44,545评论 2 352