2019消费者人群画像—信用智能评分竞赛

1. Competition Overview
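The task, as reflected in the code below, is to predict each user's credit score (the 信用分 column) from telecom consumption and behaviour features. Validation folds are evaluated with mean absolute error, and 1 / (1 + MAE) is printed as a proxy for the leaderboard score.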

2. Reference Code

Packages

# -*- coding: utf-8 -*-
import time
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.linear_model import BayesianRidge
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import mean_absolute_error
plt.rcParams[u'font.sans-serif'] = ['simhei']  # display Chinese labels correctly
plt.rcParams['axes.unicode_minus'] = False  # display minus signs correctly
pd.set_option('display.max_columns', 100)  # maximum number of columns to display
import warnings
warnings.filterwarnings("ignore")

Input data

data_path = '../input/'
train_data = pd.read_csv(data_path + 'train_dataset.csv')
test_data = pd.read_csv(data_path + 'test_dataset.csv')
sample_sub = pd.read_csv(data_path + 'submit_example.csv')
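An optional quick sanity check (nothing below depends on it) to confirm the files loaded as expected:

# Optional: confirm shapes and peek at the training set
print(train_data.shape, test_data.shape)
train_data.head()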

Pre-processing

x_cols = [col for col in train_data.columns if col not in ['信用分','用户编码']]
labels = []
values = []
x_cols
['用户实名制是否通过核实',
 '用户年龄',
 '是否大学生客户',
 '是否黑名单客户',
 '是否4G不健康客户',
 '用户网龄(月)',
 '用户最近一次缴费距今时长(月)',
 '缴费用户最近一次缴费金额(元)',
 '用户近6个月平均消费值(元)',
 '用户账单当月总费用(元)',
 '用户当月账户余额(元)',
 '缴费用户当前是否欠费缴费',
 '用户话费敏感度',
 '当月通话交往圈人数',
 '是否经常逛商场的人',
 '近三个月月均商场出现次数',
 '当月是否逛过福州仓山万达',
 '当月是否到过福州山姆会员店',
 '当月是否看电影',
 '当月是否景点游览',
 '当月是否体育场馆消费',
 '当月网购类应用使用次数',
 '当月物流快递类应用使用次数',
 '当月金融理财类应用使用总次数',
 '当月视频播放类应用使用次数',
 '当月飞机类应用使用次数',
 '当月火车类应用使用次数',
 '当月旅游资讯类应用使用次数']
# correlation between each feature and the credit score (信用分)
for col in x_cols:
    labels.append(col)
    values.append(
        np.corrcoef(train_data[col].values, train_data['信用分'].values)[0, 1])

corr_df = pd.DataFrame({'col_labels': labels, 'corr_values': values})
corr_df = corr_df.sort_values(by='corr_values')

ind = np.arange(len(labels))
width = 0.5
fig, ax = plt.subplots(figsize=(12, 60))
rects = ax.barh(ind, np.array(corr_df.corr_values.values), color='y')
ax.set_yticks(ind)
ax.set_yticklabels(corr_df.col_labels.values, rotation='horizontal')
ax.set_xlabel('Correlation coefficient')
ax.set_title('Correlation coefficient of the variables')

# density (KDE) curve of the target
def plot_kde(data):
    plt.figure(figsize=(8, 6))
    data.plot(kind='kde')


def plot_his(data):
    plt.figure(figsize=(8, 6))
    sns.distplot(data.values, bins=50, kde=False)


plot_kde(train_data['信用分'])

Feature Engineering

# Last top-up amount: round (integer) amounts vs. decimal amounts presumably
# correspond to different top-up channels
def produce_offline_feature(train_data):
    train_data['不同充值途径'] = 0
    train_data.loc[(train_data['缴费用户最近一次缴费金额(元)'] % 10 == 0) &
                   (train_data['缴费用户最近一次缴费金额(元)'] != 0), '不同充值途径'] = 1
    return train_data
train_data = produce_offline_feature(train_data)
test_data = produce_offline_feature(test_data)
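Since 不同充值途径 is a binary flag derived from whether the last top-up amount is a round multiple of 10, a quick optional look at its distribution (not part of the original notebook) shows how the two presumed channels split:

# Optional: distribution of the derived top-up-channel flag
print(train_data['不同充值途径'].value_counts())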
# Feature importance shows both the current-month bill and the 6-month average
# consumption are strong; their ratio (current month / 6-month average) measures spending stability
def produce_fee_rate(train_data):
    train_data['当前费用稳定性'] = train_data['用户账单当月总费用(元)'] / \
        (train_data['用户近6个月平均消费值(元)'] + 1)

    # current-month bill / current-month account balance
    train_data['用户余额比例'] = train_data['用户账单当月总费用(元)'] / \
        (train_data['用户当月账户余额(元)'] + 1)
    return train_data


train_data = produce_fee_rate(train_data)
test_data = produce_fee_rate(test_data)
# build features
def get_features(data):

    # replace zero ages and zero top-up amounts with the column mode
    data.loc[data['用户年龄'] == 0, '用户年龄'] = data['用户年龄'].mode()[0]
    data.loc[data['缴费用户最近一次缴费金额(元)'] ==
             0, '缴费用户最近一次缴费金额(元)'] = data['缴费用户最近一次缴费金额(元)'].mode()[0]
    data['缴费金额是否能覆盖当月账单'] = data['缴费用户最近一次缴费金额(元)'] - data['用户账单当月总费用(元)']
    data['最近一次缴费是否超过平均消费额'] = data['缴费用户最近一次缴费金额(元)'] - data['用户近6个月平均消费值(元)']
    data['当月账单是否超过平均消费额'] = data['用户账单当月总费用(元)'] - data['用户近6个月平均消费值(元)']

    # map age into buckets
    def map_age(x):
        if x <= 18:
            return 1
        elif x <= 30:
            return 2
        elif x <= 35:  # assumed cutoff
            return 3
        elif x <= 45:  # assumed cutoff
            return 4
        else:
            return 5

    data['是否大学生_黑名单'] = data['是否大学生客户'] + data['是否黑名单客户']
    data['是否实名制_大学生'] = data['是否大学生客户'] + data['用户实名制是否通过核实']
    data['是否实名制_大学生_黑名单'] = data['是否黑名单客户'] + data['是否大学生客户'] + data['用户实名制是否通过核实']
    data['是否实名制_黑名单'] = data['是否黑名单客户'] + data['用户实名制是否通过核实']

    data['是否去过高档商场'] = data['当月是否到过福州山姆会员店'] + data['当月是否逛过福州仓山万达']
    data['是否去过高档商场'] = data['是否去过高档商场'].map(lambda x: 1 if x >= 1 else 0)
    data['是否_商场_电影'] = data['是否去过高档商场'] * data['当月是否看电影']
    data['是否_商场_体育馆'] = data['是否去过高档商场'] * data['当月是否体育场馆消费']
    data['是否_商场_旅游'] = data['是否去过高档商场'] * data['当月是否景点游览']
    data['是否_电影_体育馆'] = data['当月是否看电影'] * data['当月是否体育场馆消费']
    data['是否_电影_旅游'] = data['当月是否看电影'] * data['当月是否景点游览']
    data['是否_旅游_体育馆'] = data['当月是否景点游览'] * data['当月是否体育场馆消费']

    data['是否_商场_旅游_体育馆'] = data['是否去过高档商场'] * data['当月是否景点游览'] * data[
        '当月是否体育场馆消费']
    data['是否_商场_电影_体育馆'] = data['是否去过高档商场'] * data['当月是否看电影'] * data[
        '当月是否体育场馆消费']
    data['是否_商场_电影_旅游'] = data['是否去过高档商场'] * data['当月是否看电影'] * data['当月是否景点游览']
    data['是否_体育馆_电影_旅游'] = data['当月是否体育场馆消费'] * data['当月是否看电影'] * data[
        '当月是否景点游览']
    data['是否_商场_体育馆_电影_旅游'] = data['是否去过高档商场'] * \
        data['当月是否体育场馆消费'] * data['当月是否看电影'] * data['当月是否景点游览']

    discretize_features = [
        '交通类应用使用次数', '当月物流快递类应用使用次数', '当月飞机类应用使用次数', '当月火车类应用使用次数',
        '当月旅游资讯类应用使用次数'
    ]
    data['交通类应用使用次数'] = data['当月飞机类应用使用次数'] + data['当月火车类应用使用次数']

    data['6个月平均占比总费用'] = data['用户近6个月平均消费值(元)'] / (data['用户账单当月总费用(元)'] + 1)

    def map_discretize(x):
        if x == 0:
            return 0
        elif x <= 5:
            return 1
        elif x <= 15:
            return 2
        elif x <= 50:  # assumed cutoff
            return 3
        elif x <= 100:  # assumed cutoff
            return 4
        else:
            return 5

    for col in discretize_features[:]:
        data[col] = data[col].map(lambda x: map_discretize(x))

    return data


train_data = get_features(train_data)
test_data = get_features(test_data)

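An optional lightweight check (not part of the original flow) that the feature construction ran and added columns to both sets:

# Optional: number of columns after feature engineering
print('train columns:', train_data.shape[1], ' test columns:', test_data.shape[1])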
def base_process(data):
    transform_value_feature = [
        '用户年龄', '用户网龄(月)', '当月通话交往圈人数', '近三个月月均商场出现次数', '当月网购类应用使用次数',
        '当月物流快递类应用使用次数', '当月金融理财类应用使用总次数', '当月视频播放类应用使用次数', '当月飞机类应用使用次数',
        '当月火车类应用使用次数', '当月旅游资讯类应用使用次数'
    ]
    user_fea = [
        '缴费用户最近一次缴费金额(元)', '用户近6个月平均消费值(元)', '用户账单当月总费用(元)', '用户当月账户余额(元)'
    ]
    log_features = [
        '当月网购类应用使用次数', '当月金融理财类应用使用总次数', '当月物流快递类应用使用次数', '当月视频播放类应用使用次数'
    ]
    # clip outliers to the 0.1%–99.9% range computed on the training set
    for col in transform_value_feature + user_fea + log_features:
        ulimit = np.percentile(train_data[col].values, 99.9)
        llimit = np.percentile(train_data[col].values, 0.1)
        data.loc[data[col] > ulimit, col] = ulimit
        data.loc[data[col] < llimit, col] = llimit
    # compress heavy-tailed monetary / count features with log1p
    for col in user_fea + log_features:
        data[col] = data[col].map(lambda x: np.log1p(x))
    return data


train_data = base_process(train_data)
test_data = base_process(test_data)
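To confirm the clipping and log1p transform behaved as intended, a quick describe() on a couple of the transformed columns can be run (an optional check, not in the original notebook):

# Optional: summary statistics after clipping + log1p
train_data[['用户账单当月总费用(元)', '当月网购类应用使用次数']].describe()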

Training

def display_importances(feature_importance_df_):
    cols = feature_importance_df_[[
        "feature", "importance"
    ]].groupby("feature").mean().sort_values(
        by="importance", ascending=False)[:40].index
    best_features = feature_importance_df_.loc[
        feature_importance_df_.feature.isin(cols)]
    plt.figure(figsize=(8, 10))
    sns.barplot(
        x="importance",
        y="feature",
        data=best_features.sort_values(by="importance", ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.show()
# Parameter set 1: L1 (MAE) objective
params1 = {
    'learning_rate': 0.01,
    'boosting_type': 'gbdt',
    'objective': 'regression_l1',
    'metric': 'mae',
    'feature_fraction': 0.6,  # fraction of features sampled before training each tree
    'bagging_fraction': 0.8,  # fraction of data randomly selected without resampling
    'bagging_freq': 2,  # perform bagging every 2 iterations
    'num_leaves': 31,
    'verbose': -1,
    'max_depth': 5,
    'lambda_l2': 5,
    'lambda_l1': 0,
    'num_thread': 8
}
# Parameter set 2: L2 (MSE) objective; its predictions are averaged with those of params1 later on
params2 = {
    'learning_rate': 0.01,
    'boosting_type': 'gbdt',
    'objective': 'regression_l2',
    'metric': 'mae',
    'feature_fraction': 0.6,  # fraction of features sampled before training each tree
    'bagging_fraction': 0.8,  # fraction of data randomly selected without resampling
    'bagging_freq': 2,  # perform bagging every 2 iterations
    'num_leaves': 31,
    'verbose': -1,
    'max_depth': 5,
    'lambda_l2': 5,
    'lambda_l1': 0,
    'num_thread': 8
}
cv_pred_all1 = 0
en_amount = 3

oof_lgb1 = np.zeros(len(train_data))
prediction_lgb1 = np.zeros(len(test_data))

for seed in range(en_amount):
    NFOLDS = 5
    train_label = train_data['信用分']

    kfold = KFold(n_splits=NFOLDS, shuffle=True, random_state=seed)
    kf = kfold.split(train_data, train_label)

    train_data_use = train_data.drop(['用户编码', '信用分'], axis=1)
    test_data_use = test_data.drop(['用户编码'], axis=1)

    cv_pred = np.zeros(test_data.shape[0])
    valid_best_l2_all = 0

    feature_importance_df = pd.DataFrame()
    count = 0
    for i, (train_fold, validate) in enumerate(kf):
        print('fold: ', i, ' training')
        X_train, X_validate, label_train, label_validate = \
        train_data_use.iloc[train_fold, :], train_data_use.iloc[validate, :], \
        train_label[train_fold], train_label[validate]
        dtrain = lgb.Dataset(X_train, label_train)
        dvalid = lgb.Dataset(X_validate, label_validate, reference=dtrain)
        bst = lgb.train(
            params1,
            dtrain,
            num_boost_round=10000,
            valid_sets=dvalid,
            verbose_eval=-1,
            early_stopping_rounds=250)
        cv_pred += bst.predict(test_data_use, num_iteration=bst.best_iteration)
        valid_best_l2_all += bst.best_score['valid_0']['l1']  # accumulates the fold's best validation MAE (the 'l1' key)

        oof_lgb1[validate] = bst.predict(
            X_validate, num_iteration=bst.best_iteration)
        prediction_lgb1 += bst.predict(
            test_data_use, num_iteration=bst.best_iteration) / kfold.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = list(X_train.columns)
        fold_importance_df["importance"] = bst.feature_importance(
            importance_type='split', iteration=bst.best_iteration)
        fold_importance_df["fold"] = count + 1
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)
        count += 1

    cv_pred /= NFOLDS
    valid_best_l2_all /= NFOLDS

    cv_pred_all1 += cv_pred
cv_pred_all1 /= en_amount
prediction_lgb1 /= en_amount
print('cv score1 for valid is: ', 1 / (1 + valid_best_l2_all))
fold:  0  training
Training until validation scores don't improve for 250 rounds.
Early stopping, best iteration is:
[3874]    valid_0's l1: 14.7271
fold:  1  training
Training until validation scores don't improve for 250 rounds.
Early stopping, best iteration is:
[3908]    valid_0's l1: 14.7613
fold:  2  training
Training until validation scores don't improve for 250 rounds.
Early stopping, best iteration is:
[3308]    valid_0's l1: 14.5285
fold:  3  training
Training until validation scores don't improve for 250 rounds.
Early stopping, best iteration is:
[5434]    valid_0's l1: 14.5385
fold:  4  training
Training until validation scores don't improve for 250 rounds.
Early stopping, best iteration is:
[3310]    valid_0's l1: 14.73
fold:  0  training
Training until validation scores don't improve for 250 rounds.
Early stopping, best iteration is:
[3939]    valid_0's l1: 14.6399
fold:  1  training
Training until validation scores don't improve for 250 rounds.
Early stopping, best iteration is:
[2806]    valid_0's l1: 14.6524
fold:  2  training
Training until validation scores don't improve for 250 rounds.
Early stopping, best iteration is:
[4375]    valid_0's l1: 14.5624
fold:  3  training
Training until validation scores don't improve for 250 rounds.
Early stopping, best iteration is:
[4439]    valid_0's l1: 14.7281
fold:  4  training
Training until validation scores don't improve for 250 rounds.
Early stopping, best iteration is:
[4375]    valid_0's l1: 14.7686
fold:  0  training
Training until validation scores don't improve for 250 rounds.
Early stopping, best iteration is:
[4424]    valid_0's l1: 14.7947
fold:  1  training
Training until validation scores don't improve for 250 rounds.
Early stopping, best iteration is:
[5509]    valid_0's l1: 14.8132
fold:  2  training
Training until validation scores don't improve for 250 rounds.
Early stopping, best iteration is:
[2970]    valid_0's l1: 14.4896
fold:  3  training
Training until validation scores don't improve for 250 rounds.
Early stopping, best iteration is:
[4146]    valid_0's l1: 14.6055
fold:  4  training
Training until validation scores don't improve for 250 rounds.
Early stopping, best iteration is:
[3937]    valid_0's l1: 14.5792
cv score1 for valid is:  0.06387148777659804
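The printed score only reflects the folds of the last seed (valid_best_l2_all is reset per seed). An optional extra check is to compute the competition-style score from the accumulated out-of-fold predictions instead:

# Optional: competition-style score from the out-of-fold predictions
oof_mae = mean_absolute_error(train_data['信用分'].values, oof_lgb1)
print('OOF MAE:', oof_mae, ' score:', 1 / (1 + oof_mae))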
cv_pred_all2 = 0
en_amount = 3

oof_lgb1 = np.zeros(len(train_data))
prediction_lgb2 = np.zeros(len(test_data))

for seed in range(en_amount):
    NFOLDS = 5
    train_label = train_data['信用分']

    kfold = KFold(n_splits=NFOLDS, shuffle=True, random_state=seed)
    kf = kfold.split(train_data, train_label)

    train_data_use = train_data.drop(['用户编码', '信用分'], axis=1)
    test_data_use = test_data.drop(['用户编码'], axis=1)

    cv_pred = np.zeros(test_data.shape[0])
    valid_best_l2_all = 0

    feature_importance_df = pd.DataFrame()
    count = 0
    for i, (train_fold, validate) in enumerate(kf):
        print('fold: ', i, ' training')
        X_train, X_validate, label_train, label_validate = \
        train_data_use.iloc[train_fold, :], train_data_use.iloc[validate, :], \
        train_label[train_fold], train_label[validate]
        dtrain = lgb.Dataset(X_train, label_train)
        dvalid = lgb.Dataset(X_validate, label_validate, reference=dtrain)
        bst = lgb.train(
            params2,
            dtrain,
            num_boost_round=10000,
            valid_sets=dvalid,
            verbose_eval=-1,
            early_stopping_rounds=250)
        cv_pred += bst.predict(test_data_use, num_iteration=bst.best_iteration)
        valid_best_l2_all += bst.best_score['valid_0']['l1']  # accumulates the fold's best validation MAE (the 'l1' key)

        oof_lgb1[validate] = bst.predict(
            X_validate, num_iteration=bst.best_iteration)
        prediction_lgb2 += bst.predict(
            test_data_use, num_iteration=bst.best_iteration) / kfold.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = list(X_train.columns)
        fold_importance_df["importance"] = bst.feature_importance(
            importance_type='split', iteration=bst.best_iteration)
        fold_importance_df["fold"] = count + 1
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)
        count += 1

    cv_pred /= NFOLDS
    valid_best_l2_all /= NFOLDS

    cv_pred_all2 += cv_pred
cv_pred_all2 /= en_amount
prediction_lgb2 /= en_amount
print('cv2 score for valid is: ', 1 / (1 + valid_best_l2_all))
fold:  0  training
Training until validation scores don't improve for 250 rounds.
Early stopping, best iteration is:
[4423]    valid_0's l1: 14.7215
fold:  1  training
Training until validation scores don't improve for 250 rounds.
Early stopping, best iteration is:
[3077]    valid_0's l1: 14.7534
fold:  2  training
Training until validation scores don't improve for 250 rounds.
Early stopping, best iteration is:
[3277]    valid_0's l1: 14.5732
fold:  3  training
Training until validation scores don't improve for 250 rounds.
Early stopping, best iteration is:
[4180]    valid_0's l1: 14.5316
fold:  4  training
Training until validation scores don't improve for 250 rounds.
Early stopping, best iteration is:
[3139]    valid_0's l1: 14.7765
fold:  0  training
Training until validation scores don't improve for 250 rounds.
Early stopping, best iteration is:
[2928]    valid_0's l1: 14.5982
fold:  1  training
Training until validation scores don't improve for 250 rounds.
Early stopping, best iteration is:
[2344]    valid_0's l1: 14.7204
fold:  2  training
Training until validation scores don't improve for 250 rounds.
display_importances(feature_importance_df)

XGB Training and Prediction

xgb_params = {'eta': 0.005, 'max_depth': 10, 'subsample': 0.8, 'colsample_bytree': 0.8,
              'objective': 'reg:linear',  # renamed to 'reg:squarederror' in newer xgboost releases
              'eval_metric': 'mae', 'silent': True, 'nthread': 20}
cv_pred_allxgb = 0
en_amount = 3
oof_xgb1 = np.zeros(len(train_data))
prediction_xgb1 = np.zeros(len(test_data))
for seed in range(en_amount):
    NFOLDS = 5
    train_label = train_data['信用分']
    kfold = KFold(n_splits=NFOLDS, shuffle=True, random_state=seed+2019)
    kf = kfold.split(train_data, train_label)

    train_data_use = train_data.drop(['用户编码', '信用分'], axis=1)
    test_data_use = test_data.drop(['用户编码'], axis=1)

    cv_pred = np.zeros(test_data.shape[0])
    valid_best_l2_all = 0

    feature_importance_df = pd.DataFrame()
    count = 0

    for i, (train_fold, validate) in enumerate(kf):
        print('fold: ', i, ' training')
        X_train, X_validate, label_train, label_validate = train_data_use.iloc[train_fold,
                                                                               :], train_data_use.iloc[validate, :], train_label[train_fold], train_label[validate]
        dtrain = xgb.DMatrix(X_train, label_train)
        dvalid = xgb.DMatrix(X_validate, label_validate)
        watchlist = [(dtrain, 'train'), (dvalid, 'valid_data')]
        bst = xgb.train(dtrain=dtrain, num_boost_round=10000, evals=watchlist,
                        early_stopping_rounds=100, verbose_eval=300, params=xgb_params)
        cv_pred += bst.predict(xgb.DMatrix(test_data_use),
                               ntree_limit=bst.best_ntree_limit)
        oof_xgb1[validate] = bst.predict(xgb.DMatrix(
            X_validate), ntree_limit=bst.best_ntree_limit)
        prediction_xgb1 += bst.predict(xgb.DMatrix(test_data_use),
                                       ntree_limit=bst.best_ntree_limit)/kfold.n_splits
        count += 1

    cv_pred /= NFOLDS
    cv_pred_allxgb += cv_pred
cv_pred_allxgb /= en_amount
prediction_xgb1 /= en_amount
prediction_lgb_all = (prediction_lgb1 + prediction_lgb2) / 2
# stack the lgb and xgb OOF predictions with a BayesianRidge meta-model
train_stack = np.vstack([oof_lgb1, oof_xgb1]).transpose()
test_stack = np.vstack([prediction_lgb_all, prediction_xgb1]).transpose()

folds_stack = RepeatedKFold(n_splits=5, n_repeats=2, random_state=2019)
oof_stack = np.zeros(train_stack.shape[0])
predictions = np.zeros(test_stack.shape[0])
target = train_data['信用分']
for fold_, (trn_idx, val_idx) in enumerate(folds_stack.split(train_stack, target)):
    print("fold {}".format(fold_))
    trn_data, trn_y = train_stack[trn_idx], target.iloc[trn_idx].values
    val_data, val_y = train_stack[val_idx], target.iloc[val_idx].values

    clf_3 = BayesianRidge()
    clf_3.fit(trn_data, trn_y)

    oof_stack[val_idx] = clf_3.predict(val_data)
    predictions += clf_3.predict(test_stack) / 10

mean_absolute_error(target.values, oof_stack)
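For reference, the stacked OOF error can be compared against the individual base models (an optional comparison; note that oof_lgb1 was overwritten by the second LightGBM run, so it holds the params2 out-of-fold predictions):

# Optional: compare base-model OOF MAE with the stacked OOF MAE
print('lgb (params2) OOF MAE:', mean_absolute_error(target.values, oof_lgb1))
print('xgb OOF MAE:          ', mean_absolute_error(target.values, oof_xgb1))
print('stacked OOF MAE:      ', mean_absolute_error(target.values, oof_stack))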

Submit

test_data_sub1 = test_data[['用户编码']]
test_data_sub1['score'] =  predictions
test_data_sub1.columns = ['id','score']
# test_data_sub1['score1'] = (cv_pred_all1 + cv_pred_all2) / 2
test_data_sub1.head()
test_data_sub1['score'] = test_data_sub1['score'].apply(lambda x: int(np.round(x)))
test_data_sub1.head()
test_data_sub1[['id','score']].to_csv('../output/result.csv', index=False)
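As a final optional check (not in the original notebook), the submission can be compared against the provided example file; since the column layout of submit_example.csv is not shown here, only the row count is verified:

# Optional: the submission should have one row per test user, like the example file
assert len(test_data_sub1) == len(sample_sub)
print(test_data_sub1['score'].describe())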