Python Notes: pandas and Risk-Control Scorecards

1. Functions used in this project

# -*- coding: utf-8 -*-
# %%time
# from pyhive import presto
import pandas as pd
import numpy as np
import warnings
import os
from pyhive import presto
import matplotlib.pyplot as plt
import sys
from tqdm import tqdm
import joblib  # sklearn.externals.joblib was removed in scikit-learn 0.23+; import joblib directly
from joblib import Parallel,delayed
import scorecardpy as sc
import toad
import datetime, calendar   
import time
from datetime import timedelta, date
from time import strftime, localtime  # used for the load-complete timestamp below

import pydotplus
from IPython.display import Image
from io import StringIO  # sklearn.externals.six was removed from scikit-learn; use the stdlib StringIO
from sklearn import tree
from pandas import DataFrame
from sklearn.tree import _tree
from functools import reduce  


%matplotlib inline

# Pivot tables: pass a function in aggfunc (count, min, max, sum, mean, len) to aggregate by it;
# omit it to show the share of values. index gives the rows, columns the columns, values the
# objects whose distribution is shown; fill_value=0 fills empty cells.
# pd.pivot_table(result_with_overdue,index=['RI_CFV32_lvl'],columns=['bins'],values=['mob4'],fill_value=0,aggfunc=['count']) 


# Display options: max rows, max columns, column width, etc.
pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)
pd.set_option('display.max_colwidth', 500)

cursor = presto.connect('IP', port, 'jobschedule').cursor()  # 'IP' and port are placeholders for your Presto host/port

def read_sql(hql):
    cursor.execute(hql)
    try:
        result = cursor.fetchall()
        columns_names = [i[0] for i in cursor.description]
    except Exception as e:
        print(e)
        raise
    data = pd.DataFrame(list(result), columns=columns_names)
    return data
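
# Example usage (table name is hypothetical):
# df = read_sql('select num_id, overdue from dw.sample_table limit 100')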

####################### PlotKS ##########################
def PlotKS(preds, labels, n=20, asc=True):
    
    # preds is score: asc=1
    # preds is prob: asc=0
    
    pred = preds   # predicted values
    bad = labels   # 1 = bad, 0 = good
    ksds = pd.DataFrame({'bad': bad, 'pred': pred})
    ksds['good'] = 1 - ksds.bad
    
    if asc == 1:
        ksds1 = ksds.sort_values(by=['pred', 'bad'], ascending=[True, True])
    elif asc == 0:
        ksds1 = ksds.sort_values(by=['pred', 'bad'], ascending=[False, True])
    ksds1.index = range(len(ksds1.pred))
    ksds1['cumsum_good1'] = 1.0*ksds1.good.cumsum()/sum(ksds1.good)
    ksds1['cumsum_bad1'] = 1.0*ksds1.bad.cumsum()/sum(ksds1.bad)
    
    if asc == 1:
        ksds2 = ksds.sort_values(by=['pred', 'bad'], ascending=[True, False])
    elif asc == 0:
        ksds2 = ksds.sort_values(by=['pred', 'bad'], ascending=[False, False])
    ksds2.index = range(len(ksds2.pred))
    ksds2['cumsum_good2'] = 1.0*ksds2.good.cumsum()/sum(ksds2.good)
    ksds2['cumsum_bad2'] = 1.0*ksds2.bad.cumsum()/sum(ksds2.bad)
    
    # ksds1 ksds2 -> average
    ksds = ksds1[['cumsum_good1', 'cumsum_bad1']].copy()  # copy to avoid SettingWithCopyWarning
    ksds['cumsum_good2'] = ksds2['cumsum_good2']
    ksds['cumsum_bad2'] = ksds2['cumsum_bad2']
    ksds['cumsum_good'] = (ksds['cumsum_good1'] + ksds['cumsum_good2'])/2
    ksds['cumsum_bad'] = (ksds['cumsum_bad1'] + ksds['cumsum_bad2'])/2
    
    # ks
    ksds['ks'] = ksds['cumsum_bad'] - ksds['cumsum_good']
    ksds['tile0'] = range(1, len(ksds.ks) + 1)
    ksds['tile'] = 1.0*ksds['tile0']/len(ksds['tile0'])
    
    qe = list(np.arange(0, 1, 1.0/n))
    qe.append(1)
    qe = qe[1:]
    
    ks_index = pd.Series(ksds.index)
    ks_index = ks_index.quantile(q = qe)
    ks_index = np.ceil(ks_index).astype(int)
    ks_index = list(ks_index)
    
    ksds = ksds.loc[ks_index]
    ksds = ksds[['tile', 'cumsum_good', 'cumsum_bad', 'ks']]
    ksds0 = np.array([[0, 0, 0, 0]])
    ksds = np.concatenate([ksds0, ksds], axis=0)
    ksds = pd.DataFrame(ksds, columns=['tile', 'cumsum_good', 'cumsum_bad', 'ks'])
    
    ks_value = abs(ksds.ks).max()  # 2020-05-02: take abs() so an inverted curve still yields KS
    ks_pop = ksds.tile[abs(ksds.ks).idxmax()]
    print ('ks_value is ' + str(np.round(ks_value, 4)) + ' at pop = ' + str(np.round(ks_pop, 4)))
    
    # chart
    plt.plot(ksds.tile, ksds.cumsum_good, label='cum_good',
                         color='blue', linestyle='-', linewidth=2)
                         
    plt.plot(ksds.tile, ksds.cumsum_bad, label='cum_bad',
                        color='red', linestyle='-', linewidth=2)
                        
    plt.plot(ksds.tile, ksds.ks, label='ks',
                   color='green', linestyle='-', linewidth=2)
                       
    plt.axvline(ks_pop, color='gray',  linestyle='--')
    plt.axhline(ks_value, color='green',  linestyle='--')
    plt.axhline(ksds.loc[abs(ksds.ks).idxmax(), 'cumsum_good'], color='blue', linestyle='--')
    plt.axhline(ksds.loc[abs(ksds.ks).idxmax(), 'cumsum_bad'], color='red', linestyle='--')
    plt.title('KS=%s ' %np.round(ks_value, 4) +  
                'at Pop=%s' %np.round(ks_pop, 4), fontsize=15) 
    plt.legend()
    return ksds
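
# Minimal smoke test for PlotKS on synthetic data (purely illustrative):
# rng = np.random.RandomState(0)
# demo_label = pd.Series(rng.binomial(1, 0.2, size=2000))              # 1 = bad, 0 = good
# demo_score = pd.Series(rng.normal(600, 50, 2000)) - 40 * demo_label  # bads score lower on average
# PlotKS(demo_score, demo_label, n=20, asc=True)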

# Overdue flag: 1 if days past due >= 30
def fun(x):
    if x >= 30:
        return 1
    else:
        return 0

# Grey-user flag: 1 if 0 < days past due < 30
def Grey(x):
    if (x > 0) and (x < 30):
        return 1
    else:
        return 0

# Return the year-month part (YYYY-MM) of a date
def getYearMonth(x):  
    return str(x)[0:7]  

# Share of positive samples within a group (for use with groupby/agg)
def ration(x):
    return x.sum()/x.count()
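
# The helpers above on toy data (column names are illustrative):
# demo = pd.DataFrame({'overdue_days': [0, 5, 45, 12], 'fina_date': ['2020-01-03']*4})
# demo['bad']  = demo['overdue_days'].apply(fun)        # 30+ days overdue -> 1
# demo['grey'] = demo['overdue_days'].apply(Grey)       # 1-29 days overdue -> 1
# demo['ym']   = demo['fina_date'].apply(getYearMonth)  # '2020-01'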


class rule_tree(object):
    def __init__(self, datasets, ex_lis, dep='bad_ind', min_samples=0.05, min_samples_leaf=200, min_samples_split=50,
                 max_depth=3 ):
        '''
        Currently only numeric rule variables are supported.
        datasets: input data, DataFrame
        ex_lis: features excluded from modelling, e.g. ids and time slices (include the label), list
        dep: label column name, str
        min_samples: minimum bin share of the total sample, numeric
        max_depth: maximum depth of the decision tree, numeric
        min_samples_leaf: minimum sample count in a child node, numeric
        min_samples_split: minimum sample count in a parent node before splitting, numeric
        '''
        self.datasets = datasets
        self.ex_lis = ex_lis
        self.dep = dep
        self.max_depth = max_depth
        self.min_samples = min_samples
        self.min_samples_leaf = min_samples_leaf
        self.min_samples_split = min_samples_split

    def tree_to_code(self, tree, feature_names):
        tree_ = tree.tree_
        feature_name = [feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!" for i in tree_.feature]
        print ("def tree({}):".format(", ".join(feature_names)))
         
        p_value = [i[0][0] for i in tree_.value]  # node mean values (currently unused)
        
        def recurse(node, depth):
            indent = "  " * depth
            if tree_.feature[node] != _tree.TREE_UNDEFINED:
                name = feature_name[node]
                threshold = tree_.threshold[node]
                print ("{}if {} <= {}:".format(indent, name, threshold))
                recurse(tree_.children_left[node], depth + 1)
                print ("{}if {} > {}".format(indent, name, threshold))
                #print ("{}else:".format(indent))
                recurse(tree_.children_right[node], depth + 1)
            else:
                print ("{}return {}".format(indent, tree_.value[node]))
                    

        recurse(0, 1)
    
    def get_lineage(self, tree, feature_names):
        left      = tree.tree_.children_left
        right     = tree.tree_.children_right
        threshold = tree.tree_.threshold
        features  = [feature_names[i] for i in tree.tree_.feature]

        # get ids of child nodes
        idx = np.argwhere(left == -1)[:,0]     

        def recurse(left, right, child, lineage=None):          
            if lineage is None:
                lineage = [child]
            if child in left:
                parent = np.where(left == child)[0].item()
                split = 'l'
            else:
                parent = np.where(right == child)[0].item()
                split = 'r'

            lineage.append((parent, split, threshold[parent], features[parent])) 

            if parent == 0:
                lineage.reverse()
                return lineage
            else:
                return recurse(left, right, parent, lineage)
        node_list = []
        for child in idx:
            for node in recurse(left, right, child):
                #print(node)
                node_list.append(node)
        return node_list
    
    def get_interval_node(self, node_list):
        node_dict = {}
        node_id = 1
        for id,i in enumerate(node_list[:-1]):
            if id==0:
                node_dict[node_id] = [i[1:]]
            if id>0:
                if type(node_list[id-1])==tuple:
                    if type(i)==tuple:
                        node_dict[node_id].append(i[1:])
                    else:
                        node_id += 1
                        node_dict[node_id] = []
                else:
                    node_dict[node_id].append(i[1:])
        return node_dict

    def rules(self,tree,feature_names):
        node_list = self.get_lineage(tree,feature_names)
        node_dict = self.get_interval_node(node_list)
        rules_dict = {}
        con_cal = []
        for k,v in node_dict.items():
            con = []
            con_cal_ = []
            for v_ in v:
                con.append("({} < {})".format(v_[2],v_[1]) if v_[0]=='l' else "({} >= {})".format(v_[2],v_[1]))
                con_cal_.append((self.datasets[v_[2]] < v_[1]) if v_[0]=='l' else (self.datasets[v_[2]] >= v_[1]))

            con_cal.append(reduce(lambda x, y: x*y,con_cal_))
            rules_dict[k] = ['&'.join(con),round(self.datasets[con_cal[-1]][self.dep].mean(),4)]

        rules = pd.DataFrame(rules_dict).T
        rules.columns = ['规则','p']

        return con_cal,rules
    
    def rule_indicator(self, d, cons):
        indicator = {}
        
        for id,con in enumerate(cons):

            reject = np.sum(~con)                          # rejected count
            fugai = np.sum(con)                            # passed count
            total = np.sum(con) + np.sum(~con)             # total samples
            wuju = np.sum((~con) * (d[self.dep] == 0))     # good samples wrongly rejected
            wufang = d[con][self.dep].sum()                # bad samples wrongly passed
            lanjie_hei = total - fugai - wuju              # bad samples intercepted
            shengyu_bai = fugai - wufang                   # good samples among the passed
            fugai_lv = round(fugai / total, 4)             # pass rate

            default_rate = round(wufang / np.sum(con), 4)  # overdue rate among the passed
            precise = round((reject - wuju) / reject, 4)
            recall = round((reject - wuju) / d[self.dep].sum(), 4)
            distrub = round(np.sum((~con) * (d[self.dep] == 0)) / np.sum(d[self.dep] == 0), 4)
            ks = round(recall - distrub, 4)
            # lift = bad rate among the passed / overall average overdue rate
            lift = round(default_rate / (d[self.dep].mean()), 4)

            
            indicator[id+1] = {'总拦截量':reject,'拦截坏样本':lanjie_hei,'拦截白样本':wuju,'通过量':fugai,'误放量':wufang,'误放率(逾期率)':default_rate,'总样本':total,'通过率':fugai_lv,
                           '准确率':precise,'召回率':recall,'打扰率':distrub,'ks':ks,'lift倍数':lift}
        indicator = pd.DataFrame(indicator).T
        indicator[['总拦截量','拦截坏样本','拦截白样本','通过量','误放量','总样本']] = indicator[['总拦截量','拦截坏样本','拦截白样本','通过量','误放量','总样本']].astype(int)
  
        
        return indicator

    
    def fit_plot(self):
        # DecisionTreeRegressor is used here; tree.DecisionTreeClassifier also works for classification
        dtree = tree.DecisionTreeRegressor(max_depth=self.max_depth,
                                           min_samples_leaf=self.min_samples_leaf,
                                           min_samples_split=self.min_samples_split)
        x = self.datasets.drop(self.ex_lis, axis=1)
        y = self.datasets[self.dep]
        dtree = dtree.fit(x, y)
        with open("dt.dot", "w") as f:
            tree.export_graphviz(dtree, out_file=f)
        dot_data = StringIO()
        tree.export_graphviz(dtree, out_file=dot_data,
                             feature_names=x.columns,
                             class_names=[self.dep],
                             filled=True, rounded=True,
                             special_characters=True)
        graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
        #self.tree_to_code(dtree,x.columns)
        print('\n\n')
        
        con_cal,rules = self.rules(dtree, x.columns)

        indicator = self.rule_indicator(self.datasets, con_cal)
        
        cols = ['规则','总拦截量','拦截坏样本','拦截白样本','通过量','误放量','总样本','通过率', '误放率(逾期率)','准确率', '召回率', '打扰率', 'ks', 'lift倍数']
        rule = rules.merge(indicator,left_index=True,right_index=True)[cols]
        
        rule.sort_values('误放率(逾期率)',ascending=True,inplace=True)
        
        return rule,graph.create_png()

def evaluate_func1(df,cols=None,groups=None,label='y',descs=None):
    '''
    Note: non-null and null samples are evaluated separately.
    df: input data, DataFrame
    cols: columns to evaluate, list
    groups: columns to group by, list (group columns should contain no NaN; fill them beforehand)
    label: label column name (1 = bad, 0 = good)
    descs: columns whose bins accumulate in descending order; by default bins accumulate
           in ascending order, i.e. low-score bins accumulate up into high-score bins
    return: DataFrame with columns
        'group_set': group
        'var_name': variable name
        'flag': interception interval (bin)
        'total': sample count
        'var_empty': null count
        'var_get': variable coverage (non-null count)
        'stp': intercepted sample count
        'pos': bad sample count
        'neg': good sample count
        'cunsum_pos': cumulative bad samples
        'cunsum_neg': cumulative good samples
        'cunsum_stp': cumulative intercepted samples
        'total_pos': total bad samples
        'total_neg': total good samples
        'total_stp': total samples
        'intercept': interception rate
        'precision': precision
        'recall': recall
        'disturb': disturbance rate
        'cum_precision': cumulative precision
        'avg_precision': average precision
        'cum_recall': cumulative recall
        'cum_disturb': cumulative disturbance rate
        'bks': per-bin KS
        'ks': KS
    '''
    df=df.apply(pd.to_numeric,errors='ignore')
    df[df.select_dtypes('object').columns.tolist()]=df.select_dtypes('object').apply(pd.to_datetime,errors='ignore')
    from tqdm import tqdm
    # metric computation
    def define_mertrics(df,label,descs=None):
        
        def __num_q(ser,dropna):
                num=ser.nunique(dropna)
                if num>25:
                    q=10
                elif num>15:
                    q=8
                elif num>=10:
                    q=6
                else:
                    q=num
                return q
        dts=df.select_dtypes('datetime64[ns]').shape[1]
        objs=df.select_dtypes('object').shape[1]
        nums=len(df.columns.tolist())-dts-objs
        print('data has {} datetime columns,{} object columns,{} numeric columns'.format(dts,objs,nums))
        res=[]
        if not descs:
            descs=[]
        for col in tqdm(df.columns.difference([label])):
            na_stp=df[df[col].isna()==True].shape[0]
            na_pos=df[df[col].isna()==True][label].sum()
            na_neg=na_stp-na_pos
            var_get=df.shape[0]-na_stp
            total=df.shape[0]
            ser=df[df[col].isna()==False][col]
            tmp=pd.DataFrame()
            
            # numeric with at least 10 distinct values
            if ser.dtype!=object and ser.dtype!='datetime64[ns]' and ser.nunique()>=10:
                q=__num_q(ser,dropna=True)
                _,bins=pd.qcut(ser,q=q,retbins=True,precision=2,duplicates='drop',labels=False)
                bins[0]=bins[0]-0.01
                cuts=pd.cut(ser,bins=bins)
                tmp['flag']=cuts # bins
                tmp['var_name']= col  # variable
                tmp['label']=df[label] # label
            # numeric with fewer than 10 distinct values
            elif ser.dtype!=object and ser.dtype!='datetime64[ns]' and ser.nunique()<10:
                tmp['flag']=ser
                tmp['var_name']=col
                tmp['label']=df[label]
            # non-numeric (categorical)
            else:
                tmp['flag']=ser
                tmp['var_name']=col
                tmp=tmp.astype(str)
                tmp['label']=df[label]                
            # tally non-null samples
            result_stp=tmp.groupby(['var_name','flag']).count().rename(columns={'label':'stp'})
            result_pos=tmp.groupby(['var_name','flag'])['label'].sum().to_frame().rename(columns={'label':'pos'}).astype(int) # bad count
            result_neg=tmp[tmp['label']==0].groupby(['var_name','flag'])['label'].count().to_frame().rename(columns={'label':'neg'}).astype(int) # good count
            merge_result=result_stp.merge(result_pos,how='left',on=['var_name','flag']).merge(result_neg,how='left',on=['var_name','flag'])
            merge_result.reset_index(inplace=True)
            merge_result['rank']=pd.Series([i+1 for i in range(merge_result.shape[0])])
            if ser.dtype!=object and ser.dtype!='datetime64[ns]' and ser.nunique()>=10:
                merge_result['flag'] = merge_result['flag'].cat.add_categories([0])
            merge_result.fillna(0,inplace=True)
            if col in descs:
                merge_result.sort_values(by=['rank'],axis=0,ascending=False,inplace=True,na_position='last')    
            merge_result['cunsum_pos']=merge_result.groupby('var_name')['pos'].cumsum().astype(int) # cumulative bads
            merge_result['cunsum_neg']=merge_result.groupby('var_name')['neg'].cumsum().astype(int) # cumulative goods
            merge_result['cunsum_stp']=merge_result.groupby('var_name')['stp'].cumsum().astype(int) # cumulative intercepted
            merge_result['total_pos']=merge_result['cunsum_pos'].max()
            merge_result['total_neg']=merge_result['cunsum_neg'].max()
            merge_result['total_stp']=merge_result['cunsum_stp'].max()
            # build the null-sample row
            tmpna=pd.DataFrame()
            columns=['var_name','flag','stp','pos','neg','rank','cunsum_pos','cunsum_neg','cunsum_stp','total_pos','total_neg','total_stp']
            tmpna[columns]=pd.DataFrame(np.array([[col,'empty',na_stp,na_pos,na_neg,0,na_pos,na_neg,na_stp,na_pos,na_neg,na_stp]]),columns=columns) # total_stp for the null row is na_stp
            # concat the non-null and null parts
            merge_result=pd.concat([merge_result,tmpna],axis=0)
            merge_result['var_get']=var_get
            merge_result['var_empty']=na_stp
            merge_result['total']=total
            res.append(merge_result)
        res=pd.concat(res,axis=0)
        res=res.apply(pd.to_numeric,errors='ignore')
        # null values are tallied separately
        # metric computation
        res['get_rate']=res['var_get']/(res['total']+0.001) # variable coverage
        res['intercept']=res['stp']/(res['total_stp']+0.001) # interception rate
        res['precision']=res['pos']/(res['stp']+0.001) # precision
        res['recall']=res['pos']/(res['total_pos']+0.001)  # recall
        res['disturb']=res['neg']/(res['total_neg']+0.001) # disturbance rate
        res['cum_precision']=res['cunsum_pos']/(res['cunsum_stp']+0.001)  # cumulative precision
        res['avg_precision']=res['total_pos']/(res['total_stp']+0.001) # average precision
        res['cum_recall']=res['cunsum_pos']/(res['total_pos']+0.001) # cumulative recall
        res['cum_disturb']=res['cunsum_neg']/(res['total_neg']+0.001) # cumulative disturbance rate
        res['bks']=abs(res['cum_recall']-res['cum_disturb']) # per-bin ks
        res=res.merge(res[res['flag']!='empty'].groupby('var_name')['bks'].max().to_frame().reset_index().rename(columns={'bks':'ks'}),how='left',on=['var_name'])                 
        rs=res.drop_duplicates(subset=None, keep='first', inplace=False) # deduplicate
        # columns to return
        retcols=['var_name','flag','var_empty','var_get','total','get_rate','stp','pos','neg','cunsum_stp','cunsum_pos','cunsum_neg','total_stp','total_pos','total_neg','intercept','precision','recall','disturb','cum_precision','avg_precision','cum_recall','cum_disturb','bks','ks']
        rs=rs[retcols]

        # rename metric columns
        rs.rename(columns={'var_name':'变量'
                           ,'flag':'拦截区间'
                           ,'stp':'拦截样本数'
                           ,'pos':'黑样本数'
                           ,'neg':'白样本数'
                           ,'cunsum_pos':'累计黑样本数'
                           ,'cunsum_neg':'累计白样本数'
                           ,'cunsum_stp':'累计拦截数'
                           ,'intercept':'拦截率'
                           ,'precision':'准确率'
                           ,'recall':'召回率'
                           ,'disturb':'打扰率'
                           ,'cum_precision':'累计准确率'
                           ,'avg_precision':'平均准确率'
                           ,'cum_recall':'累计召回率'
                           ,'cum_disturb':'累计打扰率'
                           ,'bks':'ks区间值'
                           ,'ks':'ks值'
                          },inplace=True)
        
        return rs
        
    # compute the metrics for each group
    group_name='group_set'
    tmp=[]
    if not cols:
        cols=df.columns.tolist()
    if not groups:
        groups=['nan_group_set']
        df[groups[0]]='&&&'
    df[groups]=df[groups].fillna('ng').astype(str)  # fill NaN first; astype(str) would turn NaN into the string 'nan'
    if len(groups)>1:
        df['group_set']=df[groups[0]].str.cat([df[g] for g in groups[1:]],sep='_')
    else:
        df['group_set']=df[groups[0]]
    group_set=df['group_set'].unique()
    for k in group_set:
        df_new=df[df['group_set']==k][cols+[label]]
        res=define_mertrics(df_new,label=label,descs=descs)
        columns=res.columns.tolist()
        res[group_name]=k
        res=res[[group_name]+columns]
        tmp.append(res)       
    outs=pd.concat(tmp,axis=0)
    return outs
print('Loaded. Time:', strftime("%Y-%m-%d %H:%M:%S", localtime()))

2. Reading data

data_changjing=pd.read_csv('*/全域mob3.csv')

# Reading a specific sheet from a multi-sheet Excel file
data_Tencent_lfq=pd.read_excel('*.xlsx',sheet_name='测试')

# Merge two DataFrames (like Excel VLOOKUP); `on` can take a single key or a list of keys
data_new3=pd.merge(data_changjing,data_y,on=['num_id'])

# Row-wise concatenation of two DataFrames
data_new3=pd.concat([data_new3,data_new2])  # DataFrame.append was removed in pandas 2.0

# Data overview
data_new3.describe() # distribution summary
data_new3.info() # dtypes, columns, null counts
data_new3.shape # (rows, columns)
data_new3.dtypes # column dtypes
data_new3[''].value_counts() # value distribution of a column (fill in the column name)
data_new3.rename(columns={'score':'Bscore'},inplace=True)  # rename a column

data_new3=data_new3[['A','B','C','D','E']] # if the data has columns A-G, keep only the ones you need
# groupby with aggregation functions (aggfunc)
# the grouping key can be a single column or a list: groupby(['bins2']) / groupby(['A','B','C'])
def ration(x):
    return x.sum()/x.count()

res2.groupby(['bins2'])['mob4'].agg(['count','sum',ration])

# agg accepts count, min, max, mean, sum, or a user-defined function
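
# A minimal illustration of the pattern on made-up data:
demo=pd.DataFrame({'bins2':['A','A','B'],'mob4':[1,0,1]})
demo.groupby(['bins2'])['mob4'].agg(['count','sum',ration]) # ration = sum/count = overdue rate per bin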

# Select and preview columns
data1=data_new3[['num_id','fina_date','zhiye']]
data1.head()

Equal-frequency and equal-width binning, storing the bin results in a new column

# Equal-frequency binning
res2['bins']=pd.qcut(res2['score'],10,duplicates='drop',retbins=True)[0]

# Equal-width binning
res2['bins2']= pd.cut(res2['score'],10,duplicates='drop',retbins=True)[0]
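
# Contrast on synthetic scores (illustrative): qcut gives roughly equal counts per bin, cut gives equal-width bins
demo_score=pd.Series(np.random.RandomState(0).normal(600,50,1000))
pd.qcut(demo_score,10).value_counts().sort_index() # ~100 rows per bin
pd.cut(demo_score,10).value_counts().sort_index()  # equal-width bins, uneven counts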

Pivot tables

# pd.pivot_table(result_with_overdue,index=['RI_CFV32_lvl'],columns=['bins'],aggfunc=['count'],values=['mob4'],fill_value=0,margins=True, dropna=True)
pd.pivot_table(result_with_overdue,index=['RI_CFV32_lvl'],columns=['bins'],values=['mob4'],fill_value=0,aggfunc=['count']) # pass a function in aggfunc to aggregate by it; omit it to show the share of values

pd.pivot_table(result_with_overdue,index=['RI_CFV32_lvl'],columns=['mob4'],values=['num_id'],fill_value=0,aggfunc=['count'])
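
# The same call on made-up data, counting mob4 per level and bin:
demo=pd.DataFrame({'RI_CFV32_lvl':['L1','L1','L2'],'bins':['b1','b2','b1'],'mob4':[1,0,1]})
pd.pivot_table(demo,index=['RI_CFV32_lvl'],columns=['bins'],values=['mob4'],fill_value=0,aggfunc=['count'])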

3. Getting the column names of a DataFrame

data_new3.columns

Result:
Index(['num_id', 'zhiye', 'weiyue', 'gongzhai', 'qingchang', 'zhuxing', 'lvyue', 'shouxin', 'xiaofei', 'xingqu', 'chengzhang', 'overdue'], dtype='object')
4. Using the scorecard package scorecardpy

import scorecardpy as sc
# Chi-square binning: get the binning results
bins_new=sc.woebin(data_new3.loc[data_new3.overdue!=-1,['gongzhai','overdue']], y="overdue")
bins_new # binning results

# Plot the binning results
woebin_plot=sc.woebin_plot(bins_new)
woebin_plot

# Show one variable's bins as a DataFrame
bins_new['gongzhai']

# Aggregate over the bin table
bins_new['gongzhai']['count'].sum()
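
# Beyond binning, scorecardpy also covers WOE transformation and scorecard construction.
# A minimal sketch of the remaining steps; `lr` (a fitted LogisticRegression) and `train_woe` are illustrative names:
train_woe=sc.woebin_ply(data_new3[['gongzhai','overdue']],bins_new) # apply WOE encoding
# card=sc.scorecard(bins_new,lr,train_woe.columns.drop('overdue'))  # build the scorecard from bins + model
# score=sc.scorecard_ply(data_new3,card)                            # score the raw data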

Plot-related function calls

plt_data=Score_set_overdue[(Score_set_overdue['order_month_x']=='2019-11')&(Score_set_overdue['class_y']=='lfq')]
ks=PlotKS(plt_data['pred_score'],plt_data['overdue'],n=20,asc=True)
ks
plt.show()
(Figure: KS trend chart, ks走势图.png)
bins_new=sc.woebin(plt_data.loc[plt_data.overdue!=-1,['pred_score','overdue']], y="overdue")
woebin_plot=sc.woebin_plot(bins_new)
plt.figure(figsize=(5,5))
woebin_plot
(Figure: per-bin overdue rate distribution, qujianyuyuqi.png)

Multi-feature crossing and selection

# Read the data
cross_data=Score_set_overdue[(Score_set_overdue['order_month_x']=='2020-01')&(Score_set_overdue['class_y']=='dae')]
# Variables excluded from modelling, including the label; ex_lis must be a list of column names
ex_list=cross_data.columns.drop(['Bscore','Tencent_score']).tolist()

# Call the decision-tree rule function
rules,graph = rule_tree(datasets=cross_data, ex_lis=ex_list,max_depth=2,dep='overdue', min_samples=0.01,
                        min_samples_leaf=200, min_samples_split=200).fit_plot()

Image(graph)
rules # inspect the extracted rules

Scorecard result processing

outs_Bscore=evaluate_func1(Bscore_Score_set_overdue[Bscore_Score_set_overdue['class_x']!='None'],cols=['Bscore'],groups=['order_month','class_x'],label='overdue')
outs_Bscore[outs_Bscore['拦截区间']!='empty']