1 Star 1 Fork 0

姜山 / credit-card-intention-prediction

Create your Gitee Account
Explore and code with more than 12 million developers,Free private repositories !:)
Sign up
Clone or Download
main.py 10.44 KB
Copy Edit Raw Blame History
姜山 authored 2021-11-15 00:23 . final push
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
import preprocess as ps
from sklearn.preprocessing import MinMaxScaler
import warnings

# Globally suppress UserWarning messages so the training log stays readable.
warnings.filterwarnings("ignore", category=UserWarning)

# Load the raw competition data. 'ID' is a pure row identifier with no
# predictive value, so it is dropped immediately from both frames.
# (Earlier manual label-encoding experiments were removed: CatBoost is fed
# the categorical columns natively via cat_features further below.)
train = pd.read_csv('train.csv').drop(columns=['ID'])
test = pd.read_csv('test.csv').drop(columns=['ID'])
def conv_dict(train, target='Is_Lead'):
    """Build an ordinal-encoding map for every non-integer column.

    Within each categorical column, the category values are ranked by the
    mean of *target* in their group (ascending), and the rank becomes the
    code — a rank-reduced form of target mean encoding.

    Args:
        train: DataFrame holding the feature columns plus *target*.
        target: Name of the binary label column (default 'Is_Lead';
            parameterized for reuse, callers passing nothing are unaffected).

    Returns:
        dict mapping column name -> {category value: ordinal code}.
    """
    dic = {}
    for col in train:
        # Skip the label itself and columns that are already integer-coded.
        if col == target or train[col].dtype == 'int64':
            continue
        # Mean label per category, sorted ascending; row order = code order.
        ranked = (train[[col, target]]
                  .groupby(col, as_index=False)
                  .mean()
                  .sort_values(by=target))
        dic[col] = {cat: rank for rank, cat in enumerate(ranked[col])}
    return dic
def conv_data(train, dic):
    """Apply per-column category->code maps (as built by conv_dict) in place.

    Integer columns are left untouched; every other column has its values
    substituted through the corresponding mapping in *dic*. The mutated
    frame is also returned for call-chaining convenience.
    """
    non_int_cols = [col for col in train.columns if train[col].dtype != 'int64']
    for col in non_int_cols:
        train[col] = train[col].replace(dic[col])
    return train
# train = MinMaxScaler(feature_range=(-1, 1)).fit_transform(train)
# train_dict = conv_dict(train)
# train = conv_data(train, train_dict)
# train = train.dropna()
# train['Gender'] = train['Gender'].replace({'Female': 0, 'Male': 1})
# train['Is_Active'] = train['Is_Active'].replace({'No': 0, 'Yes': 1})
# train['Credit_Product'] = train['Credit_Product'].replace({'No': 0, 'Yes': 1})
# train['Is_Lead'] = train['Is_Lead'].replace({0: 'No', 1: 'Yes'})
# Missing values become their own 'Missing' category, so CatBoost can treat
# absence as a signal instead of rows being dropped.
train.fillna(value='Missing', inplace=True)
test.fillna(value='Missing', inplace=True)
# DataFrame.info() prints its report itself and returns None; calling it
# bare avoids the stray "None" line that print(train.info()) emitted.
train.info()
test.info()

# Feature/label split. drop() needs the keyword form axis=1 — the positional
# axis argument was deprecated in pandas 1.0 and removed in pandas 2.0.
y = np.array(train['Is_Lead'])
x = np.array(train.drop('Is_Lead', axis=1))
# Fixed seed keeps the 80/20 hold-out split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=18)
# over_sampler = SMOTE(random_state=0)
# x_train, y_train = over_sampler.fit_resample(x_train, y_train)
# # class counts after oversampling
# print(len(os_labels[os_labels == 0]), len(os_labels[os_labels == 1]))
#
# count_classes = pd.value_counts(train['Is_Lead'], sort=True).sort_index()
# count_classes.plot(kind="bar")
# print(count_classes)
# params = {
# 'booster': 'gbtree',
# 'objective': 'binary:logistic',
# 'eval_metric': 'auc',
# 'max_depth': 10,
# 'lambda': 10,
# 'subsample': 0.85,
# 'colsample_bytree': 0.85,
# 'min_child_weight': 0,
# 'eta': 0.1,
# 'seed': 0,
# 'nthread': 8,
# 'silent': 1
# }
#
# xgb_train = xgb.DMatrix(x, label=y)
# xgb_test = xgb.DMatrix(x, label=y)
# watchlist = [(xgb_train, 'train'), (xgb_test, 'test')]
#
# num_round = 500
# bst = xgb.train(params, xgb_train, num_round, watchlist)
# bst.save_model('test.model')
# pred = bst.predict(xgb_test)
# print('predicting, classification error=%f'
# % (sum(int(pred[i]) != y[i] for i in range(len(y))) / float(len(y))))
#
# y_pred = (pred >= 0.5) * 1
# print('AUC: %.4f' % metrics.roc_auc_score(y, pred))
# print('ACC: %.4f' % metrics.accuracy_score(y, y_pred))
# print('Recall: %.4f' % metrics.recall_score(y, y_pred))
# print('F1-score: %.4f' % metrics.f1_score(y, y_pred))
# print('Precesion: %.4f' % metrics.precision_score(y, y_pred))
# print(metrics.confusion_matrix(y, y_pred))
# def tuning(cv_params, other_params, x, y):
# model = xgb.XGBClassifier(**other_params)
# optimized_model = GridSearchCV(estimator=model,
# param_grid=cv_params,
# scoring='roc_auc',
# cv=10,
# n_jobs=-1)
# optimized_model.fit(x, y)
# result = optimized_model.cv_results_['mean_test_score']
# print('每轮迭代结果:', result)
# print('最佳参数:', optimized_model.best_params_)
# print('最佳模型得分:', format(optimized_model.best_score_))
# return optimized_model
#
#
# other_params = {
# 'booster': 'gbtree',
# 'eval_metric': 'auc',
# 'objective': 'binary:logistic',
# 'n_estimators': 125,
# 'eta': 0.01,
# 'learning_rate': 0.1,
# 'max_depth': 5,
# 'min_child_weight': 6,
# 'seed': 0,
# 'subsample': 0.85,
# 'colsample_bytree': 0.8,
# 'gamma': 0.8,
# 'reg_alpha': 0.4,
# 'reg_lambda': 0.7
# }
# cv_params = {
# 'reg_alpha': [0.2, 0.3, 0.4, 0.5, 0.6],
# 'reg_lambda': [0.5, 0.6, 0.7, 0.8, 0.9]
# }
# opt = tuning(cv_params, other_params, x, y)
#
# y_pred = opt.best_estimator_.predict(x)
# print('AUC: %.4f' % metrics.roc_auc_score(y, y_pred))
# print('ACC: %.4f' % metrics.accuracy_score(y, y_pred))
# print('Recall: %.4f' % metrics.recall_score(y, y_pred))
# print('F1-score: %.4f' % metrics.f1_score(y, y_pred))
# print('Precesion: %.4f' % metrics.precision_score(y, y_pred))
# print(metrics.confusion_matrix(y, y_pred))
from catboost import Pool, CatBoostClassifier

# Column indices of the categorical features inside the numpy matrices.
# NOTE(review): assumed to line up with train.csv's column order after 'ID'
# and 'Is_Lead' are removed — verify against the data schema.
categories = [0, 2, 3, 4, 6, 8]

train_dataset = Pool(data=x_train,
                     label=y_train,
                     cat_features=categories)
eval_dataset = Pool(data=x_test,
                    label=y_test,
                    cat_features=categories)

# Ordered boosting, low learning rate, large iteration budget. Overfitting
# detection on the eval set (od_type='Iter', od_wait=500) stops training once
# AUC stops improving; runs on GPU devices 0 and 1.
model = CatBoostClassifier(
    iterations=10000,
    early_stopping_rounds=500,
    learning_rate=0.01,
    loss_function='Logloss',
    logging_level='Verbose',
    random_seed=18,
    metric_period=500,
    od_wait=500,
    max_depth=10,
    boosting_type='Ordered',
    od_type='Iter',
    eval_metric='AUC',
    task_type='GPU',
    devices='0:1',
)
model.fit(train_dataset, eval_set=eval_dataset, logging_level='Verbose')

# Held-out evaluation: hard labels for the thresholded metrics,
# positive-class probabilities for AUC.
preds_class = model.predict(eval_dataset)
preds_probs = model.predict_proba(eval_dataset)[:, 1]
print(model.feature_importances_)
print('AUC: %.4f' % metrics.roc_auc_score(y_test, preds_probs))
print('ACC: %.4f' % metrics.accuracy_score(y_test, preds_class))
print('Recall: %.4f' % metrics.recall_score(y_test, preds_class))
print('F1-score: %.4f' % metrics.f1_score(y_test, preds_class))
print('Precision: %.4f' % metrics.precision_score(y_test, preds_class))

# Score the unlabeled test set and dump the lead probabilities to disk.
test_arr = np.array(test)
preds_test = model.predict_proba(test_arr)[:, 1]
np.savetxt('./ans.csv', preds_test, fmt='%.8f', delimiter=',')
# model = CatBoostClassifier(iterations=1000,
# early_stopping_rounds=200,
# learning_rate=0.1,
# loss_function='Logloss',
# logging_level='Verbose',
# random_seed=18,
# max_depth=12,
# # l2_leaf_reg=10,
# border_count=128,
# max_ctr_complexity=5,
# # bagging_temperature=0,
# # random_strength=10,
# simple_ctr=['Borders', 'Buckets', 'FloatTargetMeanValue', 'FeatureFreq'],
# combinations_ctr=['Borders', 'Buckets', 'FloatTargetMeanValue', 'FeatureFreq'],
# nan_mode='Max',
# boosting_type='Ordered',
# od_type='Iter',
# eval_metric='AUC',
# task_type='GPU',
# devices='0:1',
# # ignored_features=[0, 2],
# )
# model.fit(x_train, y_train, cat_features=[0, 2, 3, 4, 8, 9], eval_set=(x_test, y_test),
# logging_level='Verbose')
# preds_class = model.predict(x_test)
# preds_probs = model.predict_proba(x_test)[:, 1]
#
# print(model.feature_importances_)
# print('AUC: %.4f' % metrics.roc_auc_score(y_test, preds_probs))
# print('ACC: %.4f' % metrics.accuracy_score(y_test, preds_class))
# print('Recall: %.4f' % metrics.recall_score(y_test, preds_class))
# print('F1-score: %.4f' % metrics.f1_score(y_test, preds_class))
# print('Precesion: %.4f' % metrics.precision_score(y_test, preds_class))
# category_features = [0, 2, 3, 4, 6, 8]
# cv_params = {'loss_function': ['Logloss', 'CrossEntropy']}
# other_params = {
# 'task_type': 'CPU',
# 'thread_count': -1,
# 'custom_metric': 'AUC',
# 'eval_metric': 'AUC',
# 'logging_level': 'Verbose',
# 'loss_function': 'Logloss',
# 'iterations': 1000,
# 'learning_rate': 0.1,
# 'depth': 2,
# }
# model = CatBoostClassifier(**other_params)
# optimized_model = GridSearchCV(estimator=model, param_grid=cv_params, scoring='roc_auc', cv=10, verbose=1, n_jobs=-1)
# optimized_model.fit(x_train, y_train, cat_features=category_features)
# print('参数的最佳取值:', optimized_model.best_params_)
# print('最佳模型得分:', optimized_model.best_score_)
# print(optimized_model.cv_results_['mean_test_score'])
# print(optimized_model.cv_results_['params'])
#
# preds_class = optimized_model.predict(x_test)
# preds_probs = optimized_model.predict_proba(x_test)[:, 1]
#
# print('AUC: %.4f' % metrics.roc_auc_score(y_test, preds_probs))
# print('ACC: %.4f' % metrics.accuracy_score(y_test, preds_class))
# print('Recall: %.4f' % metrics.recall_score(y_test, preds_class))
# print('F1-score: %.4f' % metrics.f1_score(y_test, preds_class))
# print('Precesion: %.4f' % metrics.precision_score(y_test, preds_class))
Python
1
https://gitee.com/buptsg2019/credit-card-intention-prediction.git
git@gitee.com:buptsg2019/credit-card-intention-prediction.git
buptsg2019
credit-card-intention-prediction
credit-card-intention-prediction
master

Search

53164aa7 5694891 3bd8fe86 5694891