Import Packages
In [1]:
# Import built-in packages
from math import isnan
from functools import reduce
# Import third-party packages
import sqlalchemy
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from plotnine import *
Preparation
In [2]:
def print_info(info, width=61, fillchar='='):
    """
    Print `info` centered within `width` characters, padded with `fillchar`.
    """
    temp_width = width - (width-len(info))//2
    print(info.rjust(temp_width, fillchar).ljust(width, fillchar))
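For example, a quick sketch (hypothetical call, not in the original notebook): the string is right-padded to the midpoint, then left-padded out to the full width.

print_info("DEMO", width=20)   # prints ========DEMO========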
In [3]:
def get_connector(user, host, database, password=None, port='5432', protocol='postgres'):
    """
    Return a SQLAlchemy engine. Defaults to PostgreSQL on port 5432.
    Note: SQLAlchemy 1.4+ only accepts the 'postgresql' dialect name.
    """
    print_info("GETTING CONNECTOR START!")
    user_info = f'{user}:{password}' if password else user
    url = f'{protocol}://{user_info}@{host}:{port}/{database}'
    engine = sqlalchemy.create_engine(url, client_encoding='utf-8')
    print_info("DONE!")
    return engine
In [4]:
def get_tables(engine, table_names):
    """
    Fetch each table named in `table_names`, in order, and return them as a
    `list` whose elements are `DataFrame`s.
    """
    print_info("GETTING TABLES START!")
    rslt = []
    for tn in table_names:
        query = f'SELECT * FROM {tn}'
        df = pd.read_sql(query, engine)
        # df = pd.read_csv(f'{tn}.csv', encoding='utf8')  # from current working directory
        print(f'{format(tn, "26s")} has {df.shape[0]:9,} rows and {df.shape[1]} columns')
        rslt.append(df)
    print_info("DONE!")
    return rslt
In [5]:
def merge_tables(tables, table_names, how):
    """
    Merge all tables into a single `DataFrame` and return it.
    """
    print_info("MERGING TABLES START!")
    # Handle the four post_{shared, comment_created, liked, collected}_{train, test} tables:
    # group by post and sum the shares, comments, likes, and collections from the first ten hours.
    for idx, (table, tn) in enumerate(zip(tables, table_names)):
        if len(tn.split('_')) == 2:
            continue  # skip the posts_{train, test} table
        col_name = f"{tn.split('_')[1]}_count"  # tn.split('_')[1] is one of {shared, comment, liked, collected}
        mapper = {'count': col_name}
        tables[idx] = table.groupby(['post_key'], as_index=False).sum().rename(columns=mapper)
    # Merge the tables together and return the result.
    total_df = reduce(lambda left, right: pd.merge(left, right, on=['post_key'], how=how), tables)
    print_info("DONE!")
    return total_df
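A minimal sketch of the reduce-merge pattern used above, with toy frames and hypothetical values (not the real schema):

left = pd.DataFrame({'post_key': ['a', 'b'], 'like_count_36_hour': [1200, 7]})
right = pd.DataFrame({'post_key': ['a', 'b'], 'shared_count': [3, 0]})
merged = reduce(lambda l, r: pd.merge(l, r, on=['post_key'], how='left'), [left, right])
# merged has one row per post_key, carrying the columns of every frame in the list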
In [6]:
def preprocess_total_df(total_df):
    """
    Preprocess the freshly merged total_df for the modeling steps that follow,
    returning a `DataFrame`.
    """
    print_info("PREPROCESSING TOTAL_DF START!")
    total_df.set_index('post_key', inplace=True)  # use post_key as the index
    total_df['created_at_hour'] = pd.to_datetime(total_df['created_at_hour'])  # cast created_at_hour to datetime
    total_df['weekday'] = total_df['created_at_hour'].dt.dayofweek  # extract the posting weekday
    total_df['hour'] = total_df['created_at_hour'].dt.hour  # extract the posting hour
    total_df.fillna(0, inplace=True)  # fill NaN with 0
    total_df['is_trending'] = 0+(total_df['like_count_36_hour']>=1000)  # derive the binary is_trending label
    total_df = total_df.drop(['created_at_hour'], axis=1)  # drop columns no longer needed
    # Cast the count columns to int
    col_names = ['shared_count', 'comment_count', 'liked_count', 'collected_count']
    for cn in col_names:
        total_df[cn] = total_df[cn].astype(dtype='int')
    print_info("DONE!")
    return total_df
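The `0+(...)` idiom on the `is_trending` line simply casts the boolean mask to integers; an equivalent, more explicit form would be:

total_df['is_trending'] = (total_df['like_count_36_hour'] >= 1000).astype(int)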
In [7]:
# Get engine
engine = get_connector(
user="candidate",
password="dcard-data-intern-2020",
host="35.187.144.113",
database="intern_task"
)
# Get tables from db
table_names_train = ['posts_train', 'post_shared_train',
'post_comment_created_train', 'post_liked_train', 'post_collected_train']
tables_train = get_tables(engine, table_names_train)
# Merge tables
total_df_train = merge_tables(tables_train, table_names_train, how='left')
# Preprocess total_df
total_df_train = preprocess_total_df(total_df_train)
engine.dispose()
===================GETTING CONNECTOR START!==================
============================DONE!============================
====================GETTING TABLES START!====================
posts_train                has   793,751 rows and 3 columns
post_shared_train          has   304,260 rows and 3 columns
post_comment_created_train has 2,372,228 rows and 3 columns
post_liked_train           has 3,395,903 rows and 3 columns
post_collected_train       has 1,235,126 rows and 3 columns
============================DONE!============================
====================MERGING TABLES START!====================
============================DONE!============================
================PREPROCESSING TOTAL_DF START!================
============================DONE!============================
In [8]:
cv_results = pd.read_csv('./outputs/cv_results.csv')
EDA
In [9]:
# Correlation heatmap of the numeric engagement features
temp = total_df_train.drop(columns=['weekday', 'hour', 'is_trending'])
sns.heatmap(temp.corr(), cmap='YlGnBu')
In [10]:
mapper = dict(zip([0, 1, 2, 3, 4, 5, 6], ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']))
In [11]:
# Number of posts by day of week and hour of day
num_articles_heatmap_df = total_df_train.groupby(['weekday', 'hour']).size().reset_index().rename(columns={0:'count'})
num_articles_heatmap_df = num_articles_heatmap_df.pivot(index='weekday', columns='hour', values='count')
num_articles_heatmap_df = num_articles_heatmap_df.rename(mapper=mapper, axis=0)
num_articles_heatmap_df = num_articles_heatmap_df.reindex(['Monday', 'Tuesday', 'Wednesday',
                                                           'Thursday', 'Friday', 'Saturday', 'Sunday'])
plt.figure(figsize=(20, 5))
plt.title('Number of Articles by Day of Week / Hour of Day')
sns.heatmap(num_articles_heatmap_df, cmap='OrRd', cbar=False)
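The next three cells repeat the same groupby → pivot → reindex → heatmap steps with different aggregates. A hedged refactoring sketch (the helper name plot_time_heatmap and its signature are ours, not from the original notebook):

WEEKDAYS = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

def plot_time_heatmap(df, title, cmap, value=None, agg='mean'):
    """Aggregate by (weekday, hour), pivot into a 7x24 grid, and draw it."""
    grouped = df.groupby(['weekday', 'hour'])
    series = grouped.size() if value is None else grouped[value].agg(agg)
    grid = (series.reset_index(name='value')
                  .pivot(index='weekday', columns='hour', values='value')
                  .rename(mapper=mapper, axis=0)
                  .reindex(WEEKDAYS))
    plt.figure(figsize=(20, 5))
    plt.title(title)
    sns.heatmap(grid, cmap=cmap, cbar=False)

# e.g. plot_time_heatmap(total_df_train, 'Average Likes within 10 hours by Day of Week / Hour of Day',
#                        'Purples', value='liked_count')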
In [12]:
# Share of trending articles by day of week and hour of day
num_pops_heatmap_df = total_df_train.groupby(['weekday', 'hour'])['is_trending'].sum().reset_index()
num_pops_heatmap_df = num_pops_heatmap_df.pivot(index='weekday', columns='hour', values='is_trending')
num_pops_heatmap_df = num_pops_heatmap_df.rename(mapper=mapper, axis=0)
num_pops_heatmap_df = num_pops_heatmap_df.reindex(['Monday', 'Tuesday', 'Wednesday',
                                                   'Thursday', 'Friday', 'Saturday', 'Sunday'])
pct_pops_heatmap_df = num_pops_heatmap_df/num_articles_heatmap_df
plt.figure(figsize=(20, 5))
plt.title('Percentage of Popular Articles by Day of Week / Hour of Day')
sns.heatmap(pct_pops_heatmap_df, cmap='Blues', cbar=False)
In [13]:
# Average like count within the first 10 hours, by day of week and hour of day
num_likes_10_heatmap_df = total_df_train.groupby(['weekday', 'hour'])['liked_count'].mean().reset_index()
num_likes_10_heatmap_df = num_likes_10_heatmap_df.pivot(index='weekday', columns='hour', values='liked_count')
num_likes_10_heatmap_df = num_likes_10_heatmap_df.rename(mapper=mapper, axis=0)
num_likes_10_heatmap_df = num_likes_10_heatmap_df.reindex(['Monday', 'Tuesday', 'Wednesday',
                                                           'Thursday', 'Friday', 'Saturday', 'Sunday'])
plt.figure(figsize=(20, 5))
plt.title('Average Likes within 10 hours by Day of Week / Hour of Day')
sns.heatmap(num_likes_10_heatmap_df, cmap='Purples', cbar=False)
In [14]:
# Average like count within the first 36 hours, by day of week and hour of day
num_likes_36_heatmap_df = total_df_train.groupby(['weekday', 'hour'])['like_count_36_hour'].mean().reset_index()
num_likes_36_heatmap_df = num_likes_36_heatmap_df.pivot(index='weekday', columns='hour', values='like_count_36_hour')
num_likes_36_heatmap_df = num_likes_36_heatmap_df.rename(mapper=mapper, axis=0)
num_likes_36_heatmap_df = num_likes_36_heatmap_df.reindex(['Monday', 'Tuesday', 'Wednesday',
                                                           'Thursday', 'Friday', 'Saturday', 'Sunday'])
plt.figure(figsize=(20, 5))
plt.title('Average Likes within 36 hours by Day of Week / Hour of Day')
sns.heatmap(num_likes_36_heatmap_df, cmap='YlGn', cbar=False)
Evaluation
In [15]:
# Drop the per-split columns we will not use
cv_results = cv_results.drop(columns=[col for col in cv_results.columns if 'split' in col])

# Simplify verbose cell values down to short labels
def transform(elem, mapper):
    if isinstance(elem, float) and isnan(elem):
        return elem
    for sub_str in mapper:
        if sub_str in elem:
            return mapper[sub_str]
    return elem

# resampler
mapper = {
    'SMOTE': 'SMOTE',
    'NearMiss': 'NearMiss'
}
cv_results['param_resampler'] = cv_results['param_resampler'].apply(transform, args=(mapper,))

# classifier
mapper = {
    'AdaBoostClassifier': 'AdaBoostClassifier',
    'XGBClassifier': 'XGBClassifier',
    'GradientBoostingClassifier': 'GradientBoostingClassifier'
}
cv_results['param_classifier'] = cv_results['param_classifier'].apply(transform, args=(mapper,))

# classifier__base_estimator
mapper = {
    'max_depth=1': 'DecisionTreeClassifier(max_depth=1)',
    'max_depth=2': 'DecisionTreeClassifier(max_depth=2)',
    'max_depth=3': 'DecisionTreeClassifier(max_depth=3)'
}
cv_results['param_classifier__base_estimator'] = cv_results['param_classifier__base_estimator'].apply(transform, args=(mapper,))
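For instance (the repr strings below are hypothetical, shortened for illustration):

transform("SMOTE(k_neighbors=5, random_state=None)", {'SMOTE': 'SMOTE'})  # -> 'SMOTE'
transform('passthrough', {'SMOTE': 'SMOTE'})  # -> 'passthrough' (no key matches, so the value passes through)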
Resampler
In [16]:
temp = cv_results.groupby(['param_resampler'])[['mean_test_precision', 'mean_test_recall', 'mean_test_specificity', 'mean_test_f1_score', 'mean_test_balanced_accuracy']].mean()
temp.reset_index(inplace=True)
(ggplot(temp, aes(x='param_resampler', y='mean_test_f1_score', fill='param_resampler'))
 + geom_bar(stat="identity")
 + ylim(0, 1)
 + scale_fill_brewer('qualitative', 2)
 + geom_text(aes(label='mean_test_f1_score'), format_string='{:.2f}')
 + ggtitle('Average F1 Score by Resampler')
 + labs(fill='Resampler')
 + xlab('Resampler')
 + ylab('Average F1 score'))
In [17]:
(ggplot(temp, aes(x='param_resampler', y='mean_test_recall', fill='param_resampler'))
 + geom_bar(stat="identity")
 + ylim(0, 1)
 + scale_fill_brewer('qualitative', 2)
 + geom_text(aes(label='mean_test_recall'), format_string='{:.2f}')
 + ggtitle('Average Recall by Resampler')
 + labs(fill='Resampler')
 + xlab('Resampler')
 + ylab('Average Recall'))
In [18]:
(ggplot(temp, aes(x='param_resampler', y='mean_test_precision', fill='param_resampler'))
 + geom_bar(stat="identity")
 + ylim(0, 1)
 + scale_fill_brewer('qualitative', 2)
 + geom_text(aes(label='mean_test_precision'), format_string='{:.2f}')
 + ggtitle('Average Precision by Resampler')
 + labs(fill='Resampler')
 + xlab('Resampler')
 + ylab('Average Precision'))
In [19]:
(ggplot(temp, aes(x='param_resampler', y='mean_test_balanced_accuracy', fill='param_resampler'))
 + geom_bar(stat="identity")
 + ylim(0, 1)
 + scale_fill_brewer('qualitative', 2)
 + geom_text(aes(label='mean_test_balanced_accuracy'), format_string='{:.2f}')
 + ggtitle('Average Balanced Accuracy by Resampler')
 + labs(fill='Resampler')
 + xlab('Resampler')
 + ylab('Average Balanced Accuracy'))
Resampler + Classifier
In [20]:
temp = cv_results.groupby(['param_resampler', 'param_classifier'])[['mean_test_precision', 'mean_test_recall', 'mean_test_specificity', 'mean_test_f1_score', 'mean_test_balanced_accuracy']].mean()
temp.reset_index(inplace=True)
(ggplot(temp, aes(x='param_resampler', y='mean_test_f1_score', fill='param_classifier'))
 + geom_bar(position='dodge', stat="identity")
 + ylim(0, 1)
 + geom_text(aes(label='mean_test_f1_score'), position=position_dodge(width=0.9), format_string='{:.2f}')
 + ggtitle('Average F1 Score by Resampler and Classifier')
 + labs(fill='Classifier')
 + xlab('Resampler')
 + ylab('Average F1 score'))
Classifier
In [21]:
temp = cv_results.groupby(['param_classifier'])[['mean_test_precision', 'mean_test_recall', 'mean_test_specificity', 'mean_test_f1_score', 'mean_test_balanced_accuracy']].mean()
temp.reset_index(inplace=True)
(ggplot(temp, aes(x='param_classifier', y='mean_test_f1_score', fill='param_classifier'))
 + geom_bar(stat="identity")
 + ylim(0, 1)
 + geom_text(aes(label='mean_test_f1_score'), format_string='{:.2f}')
 + ggtitle('Average F1 Score by Classifier')
 + labs(fill='Classifier')
 + xlab('Classifier')
 + ylab('Average F1 score'))
Classifier + n_estimators
In [22]:
temp = cv_results.groupby(['param_classifier', 'param_classifier__n_estimators'])[['mean_test_precision', 'mean_test_recall', 'mean_test_specificity', 'mean_test_f1_score', 'mean_test_balanced_accuracy']].mean()
temp.reset_index(inplace=True)
(ggplot(temp, aes(x='param_classifier__n_estimators', y='mean_test_f1_score', color='param_classifier'))
 + geom_line()
 + geom_point()
 + ylim(0, 1)
 + ggtitle('Average F1 Score by Classifier and Number of Estimators')
 + labs(color='Classifier')
 + xlab('Number of Estimators')
 + ylab('Average F1 score'))
AdaBoostClassifier + max_depth
In [23]:
cv_results[cv_results['param_resampler']=='passthrough'].groupby(['param_classifier', 'param_classifier__base_estimator'])[['mean_test_precision', 'mean_test_recall', 'mean_test_specificity', 'mean_test_f1_score', 'mean_test_balanced_accuracy']].mean()
Out[23]:
| param_classifier | param_classifier__base_estimator | mean_test_precision | mean_test_recall | mean_test_specificity | mean_test_f1_score | mean_test_balanced_accuracy |
|---|---|---|---|---|---|---|
| AdaBoostClassifier | DecisionTreeClassifier(max_depth=1) | 0.738579 | 0.436288 | 0.996339 | 0.548524 | 0.716314 |
| AdaBoostClassifier | DecisionTreeClassifier(max_depth=2) | 0.759336 | 0.443006 | 0.996670 | 0.559510 | 0.719838 |
| AdaBoostClassifier | DecisionTreeClassifier(max_depth=3) | 0.755862 | 0.441223 | 0.996619 | 0.557159 | 0.718921 |
GradientBoostingClassifier, XGBClassifier + learning_rate
In [24]:
cv_results[cv_results['param_resampler']=='passthrough'].groupby(['param_classifier', 'param_classifier__learning_rate'])[['mean_test_precision', 'mean_test_recall', 'mean_test_specificity', 'mean_test_f1_score', 'mean_test_balanced_accuracy']].mean()
Out[24]:
| param_classifier | param_classifier__learning_rate | mean_test_precision | mean_test_recall | mean_test_specificity | mean_test_f1_score | mean_test_balanced_accuracy |
|---|---|---|---|---|---|---|
| GradientBoostingClassifier | 0.025 | 0.790585 | 0.395179 | 0.997518 | 0.526909 | 0.696348 |
| GradientBoostingClassifier | 0.050 | 0.780465 | 0.423388 | 0.997177 | 0.548966 | 0.710282 |
| GradientBoostingClassifier | 0.100 | 0.778204 | 0.434859 | 0.997062 | 0.557939 | 0.715961 |
| XGBClassifier | 0.025 | 0.754734 | 0.404196 | 0.996884 | 0.526422 | 0.700540 |
| XGBClassifier | 0.050 | 0.776283 | 0.406060 | 0.997226 | 0.533204 | 0.701643 |
| XGBClassifier | 0.100 | 0.783911 | 0.419787 | 0.997256 | 0.546763 | 0.708522 |
Best Model
f1-score
In [25]:
print(cv_results[cv_results['rank_test_f1_score']==1]['params'].values[0])
{'classifier': AdaBoostClassifier(algorithm='SAMME.R', base_estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini', max_depth=2, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, presort='deprecated', random_state=None, splitter='best'), learning_rate=1.0, n_estimators=100, random_state=None), 'classifier__base_estimator': DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini', max_depth=2, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, presort='deprecated', random_state=None, splitter='best'), 'classifier__n_estimators': 100, 'resampler': 'passthrough'}
In [26]:
temp = cv_results[cv_results['rank_test_f1_score']==1]
temp[['mean_test_precision', 'mean_test_recall', 'mean_test_specificity', 'mean_test_f1_score', 'mean_test_balanced_accuracy']]
Out[26]:
|   | mean_test_precision | mean_test_recall | mean_test_specificity | mean_test_f1_score | mean_test_balanced_accuracy |
|---|---|---|---|---|---|
| 15 | 0.759668 | 0.44419 | 0.996667 | 0.560527 | 0.720429 |
balanced accuracy
In [27]:
print(cv_results[cv_results['rank_test_balanced_accuracy']==1]['params'].values[0])
{'classifier': GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None, learning_rate=0.1, loss='deviance', max_depth=3, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_iter_no_change=None, presort='deprecated', random_state=None, subsample=1.0, tol=0.0001, validation_fraction=0.1, verbose=0, warm_start=False), 'classifier__learning_rate': 0.025, 'classifier__n_estimators': 120, 'resampler': SMOTE(k_neighbors=5, n_jobs=None, random_state=None, sampling_strategy='auto')}
In [28]:
temp = cv_results[cv_results['rank_test_balanced_accuracy']==1]
temp[['mean_test_precision', 'mean_test_recall', 'mean_test_specificity', 'mean_test_f1_score', 'mean_test_balanced_accuracy']]
Out[28]:
|   | mean_test_precision | mean_test_recall | mean_test_specificity | mean_test_f1_score | mean_test_balanced_accuracy |
|---|---|---|---|---|---|
| 46 | 0.199325 | 0.958312 | 0.908746 | 0.330003 | 0.933529 |