Ranking Models
Through the recall step we have already reduced the problem size: for every user, N articles were selected as the candidate set, and from this recalled candidate set we built features related to the user's history, the user's own attribute features, the article's own attribute features, and user-article interaction features. The next step is to train a machine learning model on these constructed features, predict on the test set to obtain the click probability of every candidate article for each user, and return the top-k articles with the highest click probability as the final result.
The ranking stage uses three fairly representative ranking models:
- the LGB ranking model (LGBMRanker)
- the LGB classification model (LGBMClassifier)
- DIN, a deep-learning classification model
After obtaining the outputs of these ranking models, two classic model-ensembling methods are applied:
- weighted fusion of the output scores
- Stacking (feeding the models' outputs into a simple second-level model for another round of prediction)
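As a rough sketch of these two ideas (toy scores only, not the pipeline code used later in this section):

import pandas as pd

# two models' scores for the same (user_id, click_article_id) candidates
scores_a = pd.DataFrame({'user_id': [1, 1], 'click_article_id': [10, 20], 'pred_score': [0.9, 0.3]})
scores_b = pd.DataFrame({'user_id': [1, 1], 'click_article_id': [10, 20], 'pred_score': [0.7, 0.6]})

# 1) weighted fusion: combine the (normalized) scores of each candidate, e.g. by summing them
fused = pd.concat([scores_a, scores_b]).groupby(['user_id', 'click_article_id'])['pred_score'].sum().reset_index()

# 2) stacking: use the per-model scores as input features of a simple second-level model
#    (the real pipeline below builds these features from out-of-fold predictions)
stack_feats = scores_a.merge(scores_b, on=['user_id', 'click_article_id'], suffixes=('_lgb', '_din'))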
import numpy as np
import pandas as pd
import pickle
from tqdm import tqdm
import gc, os
import time
from datetime import datetime
import lightgbm as lgb
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings('ignore')
Reading the ranking features
data_path = './data_raw/'
save_path = './temp_results/'
offline = False
# When re-reading the data, click_article_id turned out to be a float, so convert it to int
trn_user_item_feats_df = pd.read_csv(save_path + 'trn_user_item_feats_df.csv')
trn_user_item_feats_df['click_article_id'] = trn_user_item_feats_df['click_article_id'].astype(int)
if offline:
val_user_item_feats_df = pd.read_csv(save_path + 'val_user_item_feats_df.csv')
val_user_item_feats_df['click_article_id'] = val_user_item_feats_df['click_article_id'].astype(int)
else:
val_user_item_feats_df = None
tst_user_item_feats_df = pd.read_csv(save_path + 'tst_user_item_feats_df.csv')
tst_user_item_feats_df['click_article_id'] = tst_user_item_feats_df['click_article_id'].astype(int)
# For convenience, a dummy label was also added to the test set during feature engineering; just drop it here
del tst_user_item_feats_df['label']
Returning the ranked results
def submit(recall_df, topk=5, model_name=None):
recall_df = recall_df.sort_values(by=['user_id', 'pred_score'])
recall_df['rank'] = recall_df.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')
# Check that every user has at least topk (5) candidate articles
tmp = recall_df.groupby('user_id').apply(lambda x: x['rank'].max())
assert tmp.min() >= topk
del recall_df['pred_score']
submit = recall_df[recall_df['rank'] <= topk].set_index(['user_id', 'rank']).unstack(-1).reset_index()
submit.columns = [int(col) if isinstance(col, int) else col for col in submit.columns.droplevel(0)]
# Rename the columns to match the submission format
submit = submit.rename(columns={'': 'user_id', 1: 'article_1', 2: 'article_2',
3: 'article_3', 4: 'article_4', 5: 'article_5'})
save_name = save_path + model_name + '_' + datetime.today().strftime('%m-%d') + '.csv'
submit.to_csv(save_name, index=False, header=True)
# Normalize the ranking scores
def norm_sim(sim_df, weight=0.0):
# print(sim_df.head())
min_sim = sim_df.min()
max_sim = sim_df.max()
if max_sim == min_sim:
sim_df = sim_df.apply(lambda sim: 1.0)
else:
sim_df = sim_df.apply(lambda sim: 1.0 * (sim - min_sim) / (max_sim - min_sim))
sim_df = sim_df.apply(lambda sim: sim + weight) # shift by the given weight
return sim_df
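As a quick sanity check, norm_sim min-max scales a score Series into [0, 1] and then shifts it by weight:

import pandas as pd

scores = pd.Series([2.0, 5.0, 8.0])
print(norm_sim(scores).tolist())              # [0.0, 0.5, 1.0]
print(norm_sim(scores, weight=0.1).tolist())  # [0.1, 0.6, 1.1]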
LGB Ranking Model
# Keep copies so the data can be restored if something goes wrong midway
trn_user_item_feats_df_rank_model = trn_user_item_feats_df.copy()
if offline:
val_user_item_feats_df_rank_model = val_user_item_feats_df.copy()
tst_user_item_feats_df_rank_model = tst_user_item_feats_df.copy()
# Define the feature columns
lgb_cols = ['sim0', 'time_diff0', 'word_diff0','sim_max', 'sim_min', 'sim_sum',
'sim_mean', 'score','click_size', 'time_diff_mean', 'active_level',
'click_environment','click_deviceGroup', 'click_os', 'click_country',
'click_region','click_referrer_type', 'user_time_hob1', 'user_time_hob2',
'words_hbo', 'category_id', 'created_at_ts','words_count']
# Group sizes for the ranking model (number of candidates per user)
trn_user_item_feats_df_rank_model.sort_values(by=['user_id'], inplace=True)
g_train = trn_user_item_feats_df_rank_model.groupby(['user_id'], as_index=False).count()["label"].values
if offline:
val_user_item_feats_df_rank_model.sort_values(by=['user_id'], inplace=True)
g_val = val_user_item_feats_df_rank_model.groupby(['user_id'], as_index=False).count()["label"].values
# Define the ranking model
lgb_ranker = lgb.LGBMRanker(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
max_depth=-1, n_estimators=100, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
learning_rate=0.01, min_child_weight=50, random_state=2018, n_jobs= 16)
# Train the ranking model
if offline:
lgb_ranker.fit(trn_user_item_feats_df_rank_model[lgb_cols], trn_user_item_feats_df_rank_model['label'], group=g_train,
eval_set=[(val_user_item_feats_df_rank_model[lgb_cols], val_user_item_feats_df_rank_model['label'])],
eval_group= [g_val], eval_at=[1, 2, 3, 4, 5], eval_metric=['ndcg', ], early_stopping_rounds=50, )
else:
lgb_ranker.fit(trn_user_item_feats_df[lgb_cols], trn_user_item_feats_df['label'], group=g_train)
# Model prediction
tst_user_item_feats_df['pred_score'] = lgb_ranker.predict(tst_user_item_feats_df[lgb_cols], num_iteration=lgb_ranker.best_iteration_)
# Save these ranking scores for the later model ensembling
tst_user_item_feats_df[['user_id', 'click_article_id', 'pred_score']].to_csv(save_path + 'lgb_ranker_score.csv', index=False)
# Re-rank the predictions and generate the submission file
rank_results = tst_user_item_feats_df[['user_id', 'click_article_id', 'pred_score']]
rank_results['click_article_id'] = rank_results['click_article_id'].astype(int)
submit(rank_results, topk=5, model_name='lgb_ranker')
# Five-fold cross validation, with the folds split at the user level
# This part is independent of the single train/validation run above
def get_kfold_users(trn_df, n=5):
user_ids = trn_df['user_id'].unique()
user_set = [user_ids[i::n] for i in range(n)]
return user_set
k_fold = 5
trn_df = trn_user_item_feats_df_rank_model
user_set = get_kfold_users(trn_df, n=k_fold)
score_list = []
score_df = trn_df[['user_id', 'click_article_id','label']]
sub_preds = np.zeros(tst_user_item_feats_df_rank_model.shape[0])
# Five-fold cross validation, saving the intermediate results for stacking
for n_fold, valid_user in enumerate(user_set):
train_idx = trn_df[~trn_df['user_id'].isin(valid_user)] # add slide user
valid_idx = trn_df[trn_df['user_id'].isin(valid_user)]
# Group sizes (candidates per user) for the training and validation folds
train_idx.sort_values(by=['user_id'], inplace=True)
g_train = train_idx.groupby(['user_id'], as_index=False).count()["label"].values
valid_idx.sort_values(by=['user_id'], inplace=True)
g_val = valid_idx.groupby(['user_id'], as_index=False).count()["label"].values
# Define the model
lgb_ranker = lgb.LGBMRanker(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
max_depth=-1, n_estimators=100, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
learning_rate=0.01, min_child_weight=50, random_state=2018, n_jobs= 16)
# Train the model
lgb_ranker.fit(train_idx[lgb_cols], train_idx['label'], group=g_train,
eval_set=[(valid_idx[lgb_cols], valid_idx['label'])], eval_group= [g_val],
eval_at=[1, 2, 3, 4, 5], eval_metric=['ndcg', ], early_stopping_rounds=50, )
# Predict on the validation fold
valid_idx['pred_score'] = lgb_ranker.predict(valid_idx[lgb_cols], num_iteration=lgb_ranker.best_iteration_)
# Normalize the output scores
valid_idx['pred_score'] = valid_idx[['pred_score']].transform(lambda x: norm_sim(x))
valid_idx = valid_idx.sort_values(by=['user_id', 'pred_score'])
valid_idx['pred_rank'] = valid_idx.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')
# Collect the validation-fold predictions; they are concatenated later
score_list.append(valid_idx[['user_id', 'click_article_id', 'pred_score', 'pred_rank']])
# For the online submission, accumulate the test-set predictions from each fold and average them at the end
if not offline:
sub_preds += lgb_ranker.predict(tst_user_item_feats_df_rank_model[lgb_cols], num_iteration=lgb_ranker.best_iteration_)
score_df_ = pd.concat(score_list, axis=0)
score_df = score_df.merge(score_df_, how='left', on=['user_id', 'click_article_id'])
# Save the new features generated by cross validation on the training set
score_df[['user_id', 'click_article_id', 'pred_score', 'pred_rank', 'label']].to_csv(save_path + 'trn_lgb_ranker_feats.csv', index=False)
# Test-set predictions: average over the folds, then save the predicted score and rank as features for the later stacking; more features could be constructed here as well
tst_user_item_feats_df_rank_model['pred_score'] = sub_preds / k_fold
tst_user_item_feats_df_rank_model['pred_score'] = tst_user_item_feats_df_rank_model['pred_score'].transform(lambda x: norm_sim(x))
tst_user_item_feats_df_rank_model = tst_user_item_feats_df_rank_model.sort_values(by=['user_id', 'pred_score'])
tst_user_item_feats_df_rank_model['pred_rank'] = tst_user_item_feats_df_rank_model.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')
# Save the new cross-validation features for the test set
tst_user_item_feats_df_rank_model[['user_id', 'click_article_id', 'pred_score', 'pred_rank']].to_csv(save_path + 'tst_lgb_ranker_feats.csv', index=False)
# Re-rank the predictions and generate the submission file
# Submission from this single model
rank_results = tst_user_item_feats_df_rank_model[['user_id', 'click_article_id', 'pred_score']]
rank_results['click_article_id'] = rank_results['click_article_id'].astype(int)
submit(rank_results, topk=5, model_name='lgb_ranker')
LGB Classification Model
# Define the model and its parameters
lgb_Classfication = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
max_depth=-1, n_estimators=500, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
learning_rate=0.01, min_child_weight=50, random_state=2018, n_jobs= 16, verbose=10)
# Model training
if offline:
lgb_Classfication.fit(trn_user_item_feats_df_rank_model[lgb_cols], trn_user_item_feats_df_rank_model['label'],
eval_set=[(val_user_item_feats_df_rank_model[lgb_cols], val_user_item_feats_df_rank_model['label'])],
eval_metric=['auc', ],early_stopping_rounds=50, )
else:
lgb_Classfication.fit(trn_user_item_feats_df_rank_model[lgb_cols], trn_user_item_feats_df_rank_model['label'])
# Model prediction
tst_user_item_feats_df['pred_score'] = lgb_Classfication.predict_proba(tst_user_item_feats_df[lgb_cols])[:,1]
# Save these scores for the later model ensembling
tst_user_item_feats_df[['user_id', 'click_article_id', 'pred_score']].to_csv(save_path + 'lgb_cls_score.csv', index=False)
# Re-rank the predictions and generate the submission file
rank_results = tst_user_item_feats_df[['user_id', 'click_article_id', 'pred_score']]
rank_results['click_article_id'] = rank_results['click_article_id'].astype(int)
submit(rank_results, topk=5, model_name='lgb_cls')
# Five-fold cross validation, with the folds split at the user level
# This part is independent of the single train/validation run above
def get_kfold_users(trn_df, n=5):
user_ids = trn_df['user_id'].unique()
user_set = [user_ids[i::n] for i in range(n)]
return user_set
k_fold = 5
trn_df = trn_user_item_feats_df_rank_model
user_set = get_kfold_users(trn_df, n=k_fold)
score_list = []
score_df = trn_df[['user_id', 'click_article_id', 'label']]
sub_preds = np.zeros(tst_user_item_feats_df_rank_model.shape[0])
# Five-fold cross validation, saving the intermediate results for stacking
for n_fold, valid_user in enumerate(user_set):
train_idx = trn_df[~trn_df['user_id'].isin(valid_user)] # add slide user
valid_idx = trn_df[trn_df['user_id'].isin(valid_user)]
# Define the model and its parameters
lgb_Classfication = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
max_depth=-1, n_estimators=100, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
learning_rate=0.01, min_child_weight=50, random_state=2018, n_jobs= 16, verbose=10)
# Train the model
lgb_Classfication.fit(train_idx[lgb_cols], train_idx['label'],eval_set=[(valid_idx[lgb_cols], valid_idx['label'])],
eval_metric=['auc', ],early_stopping_rounds=50, )
# Predict on the validation fold
valid_idx['pred_score'] = lgb_Classfication.predict_proba(valid_idx[lgb_cols],
num_iteration=lgb_Classfication.best_iteration_)[:,1]
# The classifier already outputs a probability, so no extra normalization is needed
# valid_idx['pred_score'] = valid_idx[['pred_score']].transform(lambda x: norm_sim(x))
valid_idx = valid_idx.sort_values(by=['user_id', 'pred_score'])
valid_idx['pred_rank'] = valid_idx.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')
# Collect the validation-fold predictions; they are concatenated later
score_list.append(valid_idx[['user_id', 'click_article_id', 'pred_score', 'pred_rank']])
# For the online submission, accumulate the test-set predictions from each fold and average them at the end
if not offline:
sub_preds += lgb_Classfication.predict_proba(tst_user_item_feats_df_rank_model[lgb_cols],
num_iteration=lgb_Classfication.best_iteration_)[:,1]
score_df_ = pd.concat(score_list, axis=0)
score_df = score_df.merge(score_df_, how='left', on=['user_id', 'click_article_id'])
# Save the new features generated by cross validation on the training set
score_df[['user_id', 'click_article_id', 'pred_score', 'pred_rank', 'label']].to_csv(save_path + 'trn_lgb_cls_feats.csv', index=False)
# Test-set predictions: average over the folds, then save the predicted score and rank as features for the later stacking; more features could be constructed here as well
tst_user_item_feats_df_rank_model['pred_score'] = sub_preds / k_fold
tst_user_item_feats_df_rank_model['pred_score'] = tst_user_item_feats_df_rank_model['pred_score'].transform(lambda x: norm_sim(x))
tst_user_item_feats_df_rank_model = tst_user_item_feats_df_rank_model.sort_values(by=['user_id', 'pred_score'])
tst_user_item_feats_df_rank_model['pred_rank'] = tst_user_item_feats_df_rank_model.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')
# Save the new cross-validation features for the test set
tst_user_item_feats_df_rank_model[['user_id', 'click_article_id', 'pred_score', 'pred_rank']].to_csv(save_path + 'tst_lgb_cls_feats.csv', index=False)
# Re-rank the predictions and generate the submission file
rank_results = tst_user_item_feats_df_rank_model[['user_id', 'click_article_id', 'pred_score']]
rank_results['click_article_id'] = rank_results['click_article_id'].astype(int)
submit(rank_results, topk=5, model_name='lgb_cls')
DIN Model
User historical click behavior lists
These lists serve as input for the DIN model below.
if offline:
all_data = pd.read_csv('./data_raw/train_click_log.csv')
else:
trn_data = pd.read_csv('./data_raw/train_click_log.csv')
tst_data = pd.read_csv('./data_raw/testA_click_log.csv')
all_data = trn_data.append(tst_data)
hist_click = all_data[['user_id', 'click_article_id']].groupby('user_id').agg(list).reset_index()
his_behavior_df = pd.DataFrame()
his_behavior_df['user_id'] = hist_click['user_id']
his_behavior_df['hist_click_article_id'] = hist_click['click_article_id']
trn_user_item_feats_df_din_model = trn_user_item_feats_df.copy()
if offline:
val_user_item_feats_df_din_model = val_user_item_feats_df.copy()
else:
val_user_item_feats_df_din_model = None
tst_user_item_feats_df_din_model = tst_user_item_feats_df.copy()
trn_user_item_feats_df_din_model = trn_user_item_feats_df_din_model.merge(his_behavior_df, on='user_id')
if offline:
val_user_item_feats_df_din_model = val_user_item_feats_df_din_model.merge(his_behavior_df, on='user_id')
else:
val_user_item_feats_df_din_model = None
tst_user_item_feats_df_din_model = tst_user_item_feats_df_din_model.merge(his_behavior_df, on='user_id')
Introduction to the DIN Model
Next we try the DIN model. DIN stands for Deep Interest Network, a model proposed by Alibaba in 2018 to address the fact that earlier deep learning models could not express the diversity of user interests. It computes a representation vector of user interest by considering the relevance between a given candidate ad and the user's historical behaviors. Concretely, it introduces a local activation unit that attends to the relevant user interests by soft-searching over the relevant parts of the historical behaviors, and uses a weighted sum to obtain the representation of the user's interest with respect to the candidate ad. Behaviors with higher relevance to the candidate ad receive higher activation weights and dominate the user-interest representation. This representation vector therefore differs from ad to ad, which greatly improves the expressive power of the model. That makes it a good fit for this news recommendation task as well: here the user's interest in an article is computed from the relevance between the current candidate article and the user's historically clicked articles.
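To make the idea of the local activation unit concrete, here is a minimal NumPy sketch (an illustration only, not deepctr's implementation: DIN's real activation unit is a small MLP over the candidate and behavior embeddings, and it does not have to softmax-normalize the weights):

import numpy as np

def user_interest(candidate_emb, hist_embs):
    """Toy local activation unit: weight each historical click by its
    relevance to the candidate article, then take the weighted sum."""
    scores = hist_embs @ candidate_emb                # relevance scores, here a simple dot product
    weights = np.exp(scores) / np.exp(scores).sum()   # normalized activation weights
    return weights @ hist_embs                        # user-interest vector for this candidate

hist = np.random.rand(3, 4)   # 3 historical clicks, embedding dimension 4
cand = np.random.rand(4)      # candidate article embedding
print(user_interest(cand, hist))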
Here we simply use an off-the-shelf implementation of the model; a detailed walkthrough of DIN will be given in the next session of this recommendation-system study group. Below is how to use it in practice. The deepctr function signature is:
def DIN(dnn_feature_columns, history_feature_list, dnn_use_bn=False,
dnn_hidden_units=(200, 80), dnn_activation='relu', att_hidden_size=(80, 40), att_activation="dice",
att_weight_normalization=False, l2_reg_dnn=0, l2_reg_embedding=1e-6, dnn_dropout=0, seed=1024,
task='binary'):
- dnn_feature_columns: the feature columns, a list containing all of the data's features
- history_feature_list: the list of features that describe the user's historical behavior
- dnn_use_bn: whether to use BatchNormalization
- dnn_hidden_units: the depth and width of the fully connected network, given as a list or tuple of layer sizes
- dnn_activation: the activation function of the fully connected network
- att_hidden_size: the depth and width of the fully connected network inside the attention layer
- att_activation: the activation function of the attention layer
- att_weight_normalization: whether to normalize the attention scores
- l2_reg_dnn: L2 regularization coefficient of the fully connected network
- l2_reg_embedding: L2 regularization coefficient of the embedding vectors
- dnn_dropout: dropout rate of the fully connected network's neurons
- task: the task, either classification ('binary') or regression
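For example, a call that overrides a few of the defaults could look like the sketch below (the hyperparameter values are purely illustrative; in this notebook the model is built later with the default arguments, and dnn_feature_columns / behavior_fea are the objects constructed in the data-preparation code further down):

# hypothetical hyperparameters, shown only to illustrate the interface
model = DIN(dnn_feature_columns, behavior_fea,
            dnn_hidden_units=(256, 128),
            att_hidden_size=(64, 16),
            att_activation='dice',
            dnn_dropout=0.1,
            task='binary')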
In practice we must pass in the feature columns and the historical-behavior column list, but before doing so the feature columns need some preprocessing. Specifically:
- First, process the dataset to obtain the inputs. Since we predict whether the user will click the current article based on past behavior, the feature columns are split into three groups: numerical features, categorical (sparse) features, and historical-behavior features; the DIN model handles each group differently.
- Categorical features are the category-type columns in our data, such as user_id. They first go through an embedding layer to obtain a low-dimensional dense representation. To build the embeddings we need a vocabulary for every categorical column and an embedding dimension, so when preparing data for deepctr's DIN model we declare these columns with SparseFeat, whose arguments are the column name, the number of unique values (for building the vocabulary) and the embedding dimension.
- Historical-behavior feature columns, such as article id or article category, also go through an embedding first. The difference is that after obtaining the embedding of each behavior, an attention layer computes the relevance between the user's historical behaviors and the current candidate article and produces the user's embedding vector. This vector reflects the user's interest through the similarity between the candidate article and the articles the user clicked in the past, and it changes with the user's click history, dynamically modeling how the user's interest evolves. For each user this is a behavior sequence whose length differs from user to user (some users clicked many articles, some only a few), so the lengths must be unified: when preparing data for DIN we first declare these columns with SparseFeat and then wrap them with VarLenSparseFeat to pad every user's history to the same length; its maxlen argument specifies the maximum sequence length.
- For numerical (dense) feature columns, we only need DenseFeat to specify the column name and dimension.
- After the feature columns have been processed, we pair the corresponding data with the columns to obtain the final inputs (a minimal sketch follows this list).
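A minimal sketch of these three kinds of declarations, with toy vocabulary sizes (the real columns are built inside get_din_feats_columns below):

from deepctr.feature_column import SparseFeat, DenseFeat, VarLenSparseFeat

# categorical feature: column name, vocabulary size, embedding dimension
user_col = SparseFeat('user_id', vocabulary_size=10000, embedding_dim=32)

# numerical feature: column name and dimension
sim_col = DenseFeat('sim0', 1)

# historical-behavior sequence: a SparseFeat wrapped with a maximum length (sequences are padded
# to maxlen); embedding_name lets it share the candidate article's embedding table
hist_col = VarLenSparseFeat(
    SparseFeat('hist_click_article_id', vocabulary_size=50000,
               embedding_dim=32, embedding_name='click_article_id'),
    maxlen=50)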
The code below makes this concrete. The logic is: first write a data-preparation function that follows the steps above and returns the inputs and the feature columns, then build and train the DIN model, and finally run prediction with the trained model.
# Import deepctr
from deepctr.models import DIN
from deepctr.feature_column import SparseFeat, VarLenSparseFeat, DenseFeat, get_feature_names
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import backend as K
from tensorflow.keras.layers import *
from tensorflow.keras.models import *
from tensorflow.keras.callbacks import *
import tensorflow as tf
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
# Data preparation function
def get_din_feats_columns(df, dense_fea, sparse_fea, behavior_fea, his_behavior_fea, emb_dim=32, max_len=100):
"""
Data preparation function:
df: the dataset
dense_fea: numerical (dense) feature columns
sparse_fea: categorical (sparse) feature columns
behavior_fea: the user's candidate-behavior feature columns
his_behavior_fea: the user's historical-behavior feature columns
emb_dim: embedding dimension; for simplicity every categorical feature uses the same embedding size
max_len: maximum length of the user's behavior sequence
"""
sparse_feature_columns = [SparseFeat(feat, vocabulary_size=df[feat].nunique() + 1, embedding_dim=emb_dim) for feat in sparse_fea]
dense_feature_columns = [DenseFeat(feat, 1, ) for feat in dense_fea]
var_feature_columns = [VarLenSparseFeat(SparseFeat(feat, vocabulary_size=df['click_article_id'].nunique() + 1,
embedding_dim=emb_dim, embedding_name='click_article_id'), maxlen=max_len) for feat in his_behavior_fea]
dnn_feature_columns = sparse_feature_columns + dense_feature_columns + var_feature_columns
# Build x as a dictionary
x = {}
for name in get_feature_names(dnn_feature_columns):
if name in his_behavior_fea:
# Historical behavior sequences
his_list = [l for l in df[name]]
x[name] = pad_sequences(his_list, maxlen=max_len, padding='post') # 2-D array
else:
x[name] = df[name].values
return x, dnn_feature_columns
# Split the features into groups
sparse_fea = ['user_id', 'click_article_id', 'category_id', 'click_environment', 'click_deviceGroup',
'click_os', 'click_country', 'click_region', 'click_referrer_type', 'is_cat_hab']
behavior_fea = ['click_article_id']
hist_behavior_fea = ['hist_click_article_id']
dense_fea = ['sim0', 'time_diff0', 'word_diff0', 'sim_max', 'sim_min', 'sim_sum', 'sim_mean', 'score',
'rank','click_size','time_diff_mean','active_level','user_time_hob1','user_time_hob2',
'words_hbo','words_count']
# Normalize the dense features; numerical inputs should be normalized for neural network training
mm = MinMaxScaler()
# The two commented lines below are a special-case fix: if invalid values (e.g. inf) appear elsewhere, the normalization fails.
# You can keep them commented out at first; if the code below errors out, it is better to track down why the inf values appear in the first place.
# trn_user_item_feats_df_din_model.replace([np.inf, -np.inf], 0, inplace=True)
# tst_user_item_feats_df_din_model.replace([np.inf, -np.inf], 0, inplace=True)
for feat in dense_fea:
trn_user_item_feats_df_din_model[feat] = mm.fit_transform(trn_user_item_feats_df_din_model[[feat]])
if val_user_item_feats_df_din_model is not None:
val_user_item_feats_df_din_model[feat] = mm.fit_transform(val_user_item_feats_df_din_model[[feat]])
tst_user_item_feats_df_din_model[feat] = mm.fit_transform(tst_user_item_feats_df_din_model[[feat]])
# Prepare the training data
x_trn, dnn_feature_columns = get_din_feats_columns(trn_user_item_feats_df_din_model, dense_fea,
sparse_fea, behavior_fea, hist_behavior_fea, max_len=50)
y_trn = trn_user_item_feats_df_din_model['label'].values
if offline:
# Prepare the validation data
x_val, dnn_feature_columns = get_din_feats_columns(val_user_item_feats_df_din_model, dense_fea,
sparse_fea, behavior_fea, hist_behavior_fea, max_len=50)
y_val = val_user_item_feats_df_din_model['label'].values
dense_fea = [x for x in dense_fea if x != 'label']
x_tst, dnn_feature_columns = get_din_feats_columns(tst_user_item_feats_df_din_model, dense_fea,
sparse_fea, behavior_fea, hist_behavior_fea, max_len=50)
# Build the model
model = DIN(dnn_feature_columns, behavior_fea)
# Inspect the model structure
model.summary()
# Compile the model
model.compile('adam', 'binary_crossentropy',metrics=['binary_crossentropy', tf.keras.metrics.AUC()])
Model: "model"
__________________________________________________________________________________________________
Layer (type) Output Shape Param # Connected to
==================================================================================================
user_id (InputLayer) [(None, 1)] 0
__________________________________________________________________________________________________
click_article_id (InputLayer) [(None, 1)] 0
__________________________________________________________________________________________________
category_id (InputLayer) [(None, 1)] 0
__________________________________________________________________________________________________
click_environment (InputLayer) [(None, 1)] 0
__________________________________________________________________________________________________
click_deviceGroup (InputLayer) [(None, 1)] 0
__________________________________________________________________________________________________
click_os (InputLayer) [(None, 1)] 0
__________________________________________________________________________________________________
click_country (InputLayer) [(None, 1)] 0
__________________________________________________________________________________________________
click_region (InputLayer) [(None, 1)] 0
__________________________________________________________________________________________________
click_referrer_type (InputLayer [(None, 1)] 0
__________________________________________________________________________________________________
is_cat_hab (InputLayer) [(None, 1)] 0
__________________________________________________________________________________________________
sparse_emb_user_id (Embedding) (None, 1, 32) 1600032 user_id[0][0]
__________________________________________________________________________________________________
sparse_seq_emb_hist_click_artic multiple 525664 click_article_id[0][0]
hist_click_article_id[0][0]
click_article_id[0][0]
__________________________________________________________________________________________________
sparse_emb_category_id (Embeddi (None, 1, 32) 7776 category_id[0][0]
__________________________________________________________________________________________________
sparse_emb_click_environment (E (None, 1, 32) 128 click_environment[0][0]
__________________________________________________________________________________________________
sparse_emb_click_deviceGroup (E (None, 1, 32) 160 click_deviceGroup[0][0]
__________________________________________________________________________________________________
sparse_emb_click_os (Embedding) (None, 1, 32) 288 click_os[0][0]
__________________________________________________________________________________________________
sparse_emb_click_country (Embed (None, 1, 32) 384 click_country[0][0]
__________________________________________________________________________________________________
sparse_emb_click_region (Embedd (None, 1, 32) 928 click_region[0][0]
__________________________________________________________________________________________________
sparse_emb_click_referrer_type (None, 1, 32) 256 click_referrer_type[0][0]
__________________________________________________________________________________________________
sparse_emb_is_cat_hab (Embeddin (None, 1, 32) 64 is_cat_hab[0][0]
__________________________________________________________________________________________________
no_mask (NoMask) (None, 1, 32) 0 sparse_emb_user_id[0][0]
sparse_seq_emb_hist_click_article
sparse_emb_category_id[0][0]
sparse_emb_click_environment[0][0
sparse_emb_click_deviceGroup[0][0
sparse_emb_click_os[0][0]
sparse_emb_click_country[0][0]
sparse_emb_click_region[0][0]
sparse_emb_click_referrer_type[0]
sparse_emb_is_cat_hab[0][0]
__________________________________________________________________________________________________
hist_click_article_id (InputLay [(None, 50)] 0
__________________________________________________________________________________________________
concatenate (Concatenate) (None, 1, 320) 0 no_mask[0][0]
no_mask[1][0]
no_mask[2][0]
no_mask[3][0]
no_mask[4][0]
no_mask[5][0]
no_mask[6][0]
no_mask[7][0]
no_mask[8][0]
no_mask[9][0]
__________________________________________________________________________________________________
no_mask_1 (NoMask) (None, 1, 320) 0 concatenate[0][0]
__________________________________________________________________________________________________
attention_sequence_pooling_laye (None, 1, 32) 13961 sparse_seq_emb_hist_click_article
sparse_seq_emb_hist_click_article
__________________________________________________________________________________________________
concatenate_1 (Concatenate) (None, 1, 352) 0 no_mask_1[0][0]
attention_sequence_pooling_layer[
__________________________________________________________________________________________________
sim0 (InputLayer) [(None, 1)] 0
__________________________________________________________________________________________________
time_diff0 (InputLayer) [(None, 1)] 0
__________________________________________________________________________________________________
word_diff0 (InputLayer) [(None, 1)] 0
__________________________________________________________________________________________________
sim_max (InputLayer) [(None, 1)] 0
__________________________________________________________________________________________________
sim_min (InputLayer) [(None, 1)] 0
__________________________________________________________________________________________________
sim_sum (InputLayer) [(None, 1)] 0
__________________________________________________________________________________________________
sim_mean (InputLayer) [(None, 1)] 0
__________________________________________________________________________________________________
score (InputLayer) [(None, 1)] 0
__________________________________________________________________________________________________
rank (InputLayer) [(None, 1)] 0
__________________________________________________________________________________________________
click_size (InputLayer) [(None, 1)] 0
__________________________________________________________________________________________________
time_diff_mean (InputLayer) [(None, 1)] 0
__________________________________________________________________________________________________
active_level (InputLayer) [(None, 1)] 0
__________________________________________________________________________________________________
user_time_hob1 (InputLayer) [(None, 1)] 0
__________________________________________________________________________________________________
user_time_hob2 (InputLayer) [(None, 1)] 0
__________________________________________________________________________________________________
words_hbo (InputLayer) [(None, 1)] 0
__________________________________________________________________________________________________
words_count (InputLayer) [(None, 1)] 0
__________________________________________________________________________________________________
flatten (Flatten) (None, 352) 0 concatenate_1[0][0]
__________________________________________________________________________________________________
no_mask_3 (NoMask) (None, 1) 0 sim0[0][0]
time_diff0[0][0]
word_diff0[0][0]
sim_max[0][0]
sim_min[0][0]
sim_sum[0][0]
sim_mean[0][0]
score[0][0]
rank[0][0]
click_size[0][0]
time_diff_mean[0][0]
active_level[0][0]
user_time_hob1[0][0]
user_time_hob2[0][0]
words_hbo[0][0]
words_count[0][0]
__________________________________________________________________________________________________
no_mask_2 (NoMask) (None, 352) 0 flatten[0][0]
__________________________________________________________________________________________________
concatenate_2 (Concatenate) (None, 16) 0 no_mask_3[0][0]
no_mask_3[1][0]
no_mask_3[2][0]
no_mask_3[3][0]
no_mask_3[4][0]
no_mask_3[5][0]
no_mask_3[6][0]
no_mask_3[7][0]
no_mask_3[8][0]
no_mask_3[9][0]
no_mask_3[10][0]
no_mask_3[11][0]
no_mask_3[12][0]
no_mask_3[13][0]
no_mask_3[14][0]
no_mask_3[15][0]
__________________________________________________________________________________________________
flatten_1 (Flatten) (None, 352) 0 no_mask_2[0][0]
__________________________________________________________________________________________________
flatten_2 (Flatten) (None, 16) 0 concatenate_2[0][0]
__________________________________________________________________________________________________
no_mask_4 (NoMask) multiple 0 flatten_1[0][0]
flatten_2[0][0]
__________________________________________________________________________________________________
concatenate_3 (Concatenate) (None, 368) 0 no_mask_4[0][0]
no_mask_4[1][0]
__________________________________________________________________________________________________
dnn_1 (DNN) (None, 80) 89880 concatenate_3[0][0]
__________________________________________________________________________________________________
dense (Dense) (None, 1) 80 dnn_1[0][0]
__________________________________________________________________________________________________
prediction_layer (PredictionLay (None, 1) 1 dense[0][0]
==================================================================================================
Total params: 2,239,602
Trainable params: 2,239,362
Non-trainable params: 240
__________________________________________________________________________________________________
# Model training
if offline:
history = model.fit(x_trn, y_trn, verbose=1, epochs=10, validation_data=(x_val, y_val) , batch_size=256)
else:
# The fit call above (with validation_data) can also be used here with a validation set you sample yourself
# history = model.fit(x_trn, y_trn, verbose=1, epochs=3, validation_split=0.3, batch_size=256)
history = model.fit(x_trn, y_trn, verbose=1, epochs=2, batch_size=256)
Epoch 1/2
290964/290964 [==============================] - 55s 189us/sample - loss: 0.4209 - binary_crossentropy: 0.4206 - auc: 0.7842
Epoch 2/2
290964/290964 [==============================] - 52s 178us/sample - loss: 0.3630 - binary_crossentropy: 0.3618 - auc: 0.8478
# Model prediction
tst_user_item_feats_df_din_model['pred_score'] = model.predict(x_tst, verbose=1, batch_size=256)
tst_user_item_feats_df_din_model[['user_id', 'click_article_id', 'pred_score']].to_csv(save_path + 'din_rank_score.csv', index=False)
500000/500000 [==============================] - 20s 39us/sample
# Re-rank the predictions and generate the submission file
rank_results = tst_user_item_feats_df_din_model[['user_id', 'click_article_id', 'pred_score']]
submit(rank_results, topk=5, model_name='din')
# Five-fold cross validation, with the folds split at the user level
# This part is independent of the single train/validation run above
def get_kfold_users(trn_df, n=5):
user_ids = trn_df['user_id'].unique()
user_set = [user_ids[i::n] for i in range(n)]
return user_set
k_fold = 5
trn_df = trn_user_item_feats_df_din_model
user_set = get_kfold_users(trn_df, n=k_fold)
score_list = []
score_df = trn_df[['user_id', 'click_article_id', 'label']]
sub_preds = np.zeros(tst_user_item_feats_df_rank_model.shape[0])
dense_fea = [x for x in dense_fea if x != 'label']
x_tst, dnn_feature_columns = get_din_feats_columns(tst_user_item_feats_df_din_model, dense_fea,
sparse_fea, behavior_fea, hist_behavior_fea, max_len=50)
# Five-fold cross validation, saving the intermediate results for stacking
for n_fold, valid_user in enumerate(user_set):
train_idx = trn_df[~trn_df['user_id'].isin(valid_user)] # add slide user
valid_idx = trn_df[trn_df['user_id'].isin(valid_user)]
# Prepare the training data for this fold
x_trn, dnn_feature_columns = get_din_feats_columns(train_idx, dense_fea,
sparse_fea, behavior_fea, hist_behavior_fea, max_len=50)
y_trn = train_idx['label'].values
# Prepare the validation data for this fold
x_val, dnn_feature_columns = get_din_feats_columns(valid_idx, dense_fea,
sparse_fea, behavior_fea, hist_behavior_fea, max_len=50)
y_val = valid_idx['label'].values
# Re-create and compile the model in each fold so the folds do not share trained weights
model = DIN(dnn_feature_columns, behavior_fea)
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy', tf.keras.metrics.AUC()])
history = model.fit(x_trn, y_trn, verbose=1, epochs=2, validation_data=(x_val, y_val), batch_size=256)
# Predict on the validation fold
valid_idx['pred_score'] = model.predict(x_val, verbose=1, batch_size=256)
valid_idx = valid_idx.sort_values(by=['user_id', 'pred_score'])
valid_idx['pred_rank'] = valid_idx.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')
# Collect the validation-fold predictions; they are concatenated later
score_list.append(valid_idx[['user_id', 'click_article_id', 'pred_score', 'pred_rank']])
# For the online submission, accumulate the test-set predictions from each fold and average them at the end
if not offline:
sub_preds += model.predict(x_tst, verbose=1, batch_size=256)[:, 0]
score_df_ = pd.concat(score_list, axis=0)
score_df = score_df.merge(score_df_, how='left', on=['user_id', 'click_article_id'])
# Save the new features generated by cross validation on the training set
score_df[['user_id', 'click_article_id', 'pred_score', 'pred_rank', 'label']].to_csv(save_path + 'trn_din_cls_feats.csv', index=False)
# Test-set predictions: average over the folds, then save the predicted score and rank as features for the later stacking; more features could be constructed here as well
tst_user_item_feats_df_din_model['pred_score'] = sub_preds / k_fold
tst_user_item_feats_df_din_model['pred_score'] = tst_user_item_feats_df_din_model['pred_score'].transform(lambda x: norm_sim(x))
tst_user_item_feats_df_din_model = tst_user_item_feats_df_din_model.sort_values(by=['user_id', 'pred_score'])
tst_user_item_feats_df_din_model['pred_rank'] = tst_user_item_feats_df_din_model.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')
# Save the new cross-validation features for the test set
tst_user_item_feats_df_din_model[['user_id', 'click_article_id', 'pred_score', 'pred_rank']].to_csv(save_path + 'tst_din_cls_feats.csv', index=False)
Model Ensembling
Weighted fusion
# Read the ranking-score files produced by the different models
lgb_ranker = pd.read_csv(save_path + 'lgb_ranker_score.csv')
lgb_cls = pd.read_csv(save_path + 'lgb_cls_score.csv')
din_ranker = pd.read_csv(save_path + 'din_rank_score.csv')
# The cross-validated test-set outputs could also be used here for the weighted fusion
rank_model = {'lgb_ranker': lgb_ranker,
'lgb_cls': lgb_cls,
'din_ranker': din_ranker}
def get_ensumble_predict_topk(rank_model, topk=5):
final_recall = rank_model['lgb_cls'].append(rank_model['din_ranker'])
rank_model['lgb_ranker']['pred_score'] = rank_model['lgb_ranker']['pred_score'].transform(lambda x: norm_sim(x))
final_recall = final_recall.append(rank_model['lgb_ranker'])
final_recall = final_recall.groupby(['user_id', 'click_article_id'])['pred_score'].sum().reset_index()
submit(final_recall, topk=topk, model_name='ensemble_fuse')
get_ensumble_predict_topk(rank_model)
Stacking
# Read the cross-validation result files produced by the different models
# Training set
trn_lgb_ranker_feats = pd.read_csv(save_path + 'trn_lgb_ranker_feats.csv')
trn_lgb_cls_feats = pd.read_csv(save_path + 'trn_lgb_cls_feats.csv')
trn_din_cls_feats = pd.read_csv(save_path + 'trn_din_cls_feats.csv')
# Test set
tst_lgb_ranker_feats = pd.read_csv(save_path + 'tst_lgb_ranker_feats.csv')
tst_lgb_cls_feats = pd.read_csv(save_path + 'tst_lgb_cls_feats.csv')
tst_din_cls_feats = pd.read_csv(save_path + 'tst_din_cls_feats.csv')
# Concatenate the features output by the different models
finall_trn_ranker_feats = trn_lgb_ranker_feats[['user_id', 'click_article_id', 'label']]
finall_tst_ranker_feats = tst_lgb_ranker_feats[['user_id', 'click_article_id']]
for idx, trn_model in enumerate([trn_lgb_ranker_feats, trn_lgb_cls_feats, trn_din_cls_feats]):
for feat in [ 'pred_score', 'pred_rank']:
col_name = feat + '_' + str(idx)
finall_trn_ranker_feats[col_name] = trn_model[feat]
for idx, tst_model in enumerate([tst_lgb_ranker_feats, tst_lgb_cls_feats, tst_din_cls_feats]):
for feat in [ 'pred_score', 'pred_rank']:
col_name = feat + '_' + str(idx)
finall_tst_ranker_feats[col_name] = tst_model[feat]
# Fit a logistic regression on the features produced by cross validation and use it to predict on the test set
# Note: during cross validation you can construct more features related to the predicted values to enrich this simple second-level model
from sklearn.linear_model import LogisticRegression
feat_cols = ['pred_score_0', 'pred_rank_0', 'pred_score_1', 'pred_rank_1', 'pred_score_2', 'pred_rank_2']
trn_x = finall_trn_ranker_feats[feat_cols]
trn_y = finall_trn_ranker_feats['label']
tst_x = finall_tst_ranker_feats[feat_cols]
# Define the model
lr = LogisticRegression()
# Model training
lr.fit(trn_x, trn_y)
# Model prediction
finall_tst_ranker_feats['pred_score'] = lr.predict_proba(tst_x)[:, 1]
# Re-rank the predictions and generate the submission file
rank_results = finall_tst_ranker_feats[['user_id', 'click_article_id', 'pred_score']]
submit(rank_results, topk=5, model_name='ensumble_staking')
Summary
This chapter covered three ranking models: the LGB ranker, the LGB classifier, and the deep-learning DIN model. We did not go into the theory behind these three models in detail; please explore the underlying principles on your own, and feel free to share what you learn so that we can all improve together. Finally, we applied two simple model-ensembling strategies: weighted fusion and Stacking.
The theory part is still owed for now and will be filled in later; consider this post a promise.