Import Packages
In [1]:
# Import built-in packages
from math import isnan
from functools import reduce
# Import third-party packages
import sqlalchemy
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from plotnine import *
Preparation
In [2]:
def print_info(info, width=61, fillchar='='):
    """
    Print `info` centered within `width` characters, padded with `fillchar`.
    """
    temp_width = width - (width-len(info))//2
    print(info.rjust(temp_width, fillchar).ljust(width, fillchar))
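For example, a quick sketch (hypothetical call, not in the original notebook): the string is right-padded to the midpoint, then left-padded out to the full width.

print_info("DEMO", width=20)   # prints ========DEMO========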
In [3]:
def get_connector(user, host, database, password=None, port='5432', protocol='postgres'):
    """
    Return a SQLAlchemy engine. Defaults to PostgreSQL on port 5432.
    Note: SQLAlchemy 1.4+ only accepts the 'postgresql' dialect name.
    """
    print_info("GETTING CONNECTOR START!")
    user_info = f'{user}:{password}' if password else user
    url = f'{protocol}://{user_info}@{host}:{port}/{database}'
    engine = sqlalchemy.create_engine(url, client_encoding='utf-8')
    print_info("DONE!")
    return engine
In [4]:
def get_tables(engine, table_names):
    """
    Fetch each table named in `table_names`, in order, and return them as a
    `list` whose elements are `DataFrame`s.
    """
    print_info("GETTING TABLES START!")
    rslt = []
    for tn in table_names:
        query = f'SELECT * FROM {tn}'
        df = pd.read_sql(query, engine)
        # df = pd.read_csv(f'{tn}.csv', encoding='utf8')  # from current working directory
        print(f'{format(tn, "26s")} has {df.shape[0]:9,} rows and {df.shape[1]} columns')
        rslt.append(df)
    print_info("DONE!")
    return rslt
In [5]:
def merge_tables(tables, table_names, how):
    """
    Merge all tables into a single `DataFrame` and return it.
    """
    print_info("MERGING TABLES START!")
    # Handle the four post_{shared, comment_created, liked, collected}_{train, test} tables:
    # group by post and sum the shares, comments, likes, and collections from the first ten hours.
    for idx, (table, tn) in enumerate(zip(tables, table_names)):
        if len(tn.split('_')) == 2:
            continue  # skip the posts_{train, test} table
        col_name = f"{tn.split('_')[1]}_count"  # tn.split('_')[1] is one of {shared, comment, liked, collected}
        mapper = {'count': col_name}
        tables[idx] = table.groupby(['post_key'], as_index=False).sum().rename(columns=mapper)
    # Merge the tables together and return the result.
    total_df = reduce(lambda left, right: pd.merge(left, right, on=['post_key'], how=how), tables)
    print_info("DONE!")
    return total_df
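A minimal sketch of the reduce-merge pattern used above, with toy frames and hypothetical values (not the real schema):

left = pd.DataFrame({'post_key': ['a', 'b'], 'like_count_36_hour': [1200, 7]})
right = pd.DataFrame({'post_key': ['a', 'b'], 'shared_count': [3, 0]})
merged = reduce(lambda l, r: pd.merge(l, r, on=['post_key'], how='left'), [left, right])
# merged has one row per post_key, carrying the columns of every frame in the list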
In [6]:
def preprocess_total_df(total_df):
    """
    Preprocess the freshly merged total_df for the modeling steps that follow,
    returning a `DataFrame`.
    """
    print_info("PREPROCESSING TOTAL_DF START!")
    total_df.set_index('post_key', inplace=True)  # use post_key as the index
    total_df['created_at_hour'] = pd.to_datetime(total_df['created_at_hour'])  # cast created_at_hour to datetime
    total_df['weekday'] = total_df['created_at_hour'].dt.dayofweek  # extract the posting weekday
    total_df['hour'] = total_df['created_at_hour'].dt.hour  # extract the posting hour
    total_df.fillna(0, inplace=True)  # fill NaN with 0
    total_df['is_trending'] = 0+(total_df['like_count_36_hour']>=1000)  # derive the binary is_trending label
    total_df = total_df.drop(['created_at_hour'], axis=1)  # drop columns no longer needed
    # Cast the count columns to int
    col_names = ['shared_count', 'comment_count', 'liked_count', 'collected_count']
    for cn in col_names:
        total_df[cn] = total_df[cn].astype(dtype='int')
    print_info("DONE!")
    return total_df
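The `0+(...)` idiom on the `is_trending` line simply casts the boolean mask to integers; an equivalent, more explicit form would be:

total_df['is_trending'] = (total_df['like_count_36_hour'] >= 1000).astype(int)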
In [7]:
# Get engine
engine = get_connector(
user="candidate",
password="dcard-data-intern-2020",
host="35.187.144.113",
database="intern_task"
)
# Get tables from db
table_names_train = ['posts_train', 'post_shared_train',
'post_comment_created_train', 'post_liked_train', 'post_collected_train']
tables_train = get_tables(engine, table_names_train)
# Merge tables
total_df_train = merge_tables(tables_train, table_names_train, how='left')
# Preprocess total_df
total_df_train = preprocess_total_df(total_df_train)
engine.dispose()
===================GETTING CONNECTOR START!==================
============================DONE!============================
====================GETTING TABLES START!====================
posts_train                has   793,751 rows and 3 columns
post_shared_train          has   304,260 rows and 3 columns
post_comment_created_train has 2,372,228 rows and 3 columns
post_liked_train           has 3,395,903 rows and 3 columns
post_collected_train       has 1,235,126 rows and 3 columns
============================DONE!============================
====================MERGING TABLES START!====================
============================DONE!============================
================PREPROCESSING TOTAL_DF START!================
============================DONE!============================
In [8]:
cv_results = pd.read_csv('./outputs/cv_results.csv')
EDA
In [9]:
# Correlation heatmap of the numeric engagement features
temp = total_df_train.drop(columns=['weekday', 'hour', 'is_trending'])
sns.heatmap(temp.corr(), cmap='YlGnBu')
In [10]:
mapper = dict(zip([0, 1, 2, 3, 4, 5, 6], ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']))
In [11]:
# Number of posts by day of week and hour of day
num_articles_heatmap_df = total_df_train.groupby(['weekday', 'hour']).size().reset_index().rename(columns={0:'count'})
num_articles_heatmap_df = num_articles_heatmap_df.pivot(index='weekday', columns='hour', values='count')
num_articles_heatmap_df = num_articles_heatmap_df.rename(mapper=mapper, axis=0)
num_articles_heatmap_df = num_articles_heatmap_df.reindex(['Monday', 'Tuesday', 'Wednesday',
                                                           'Thursday', 'Friday', 'Saturday', 'Sunday'])
plt.figure(figsize=(20, 5))
plt.title('Number of Articles by Day of Week / Hour of Day')
sns.heatmap(num_articles_heatmap_df, cmap='OrRd', cbar=False)
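The next three cells repeat the same groupby → pivot → reindex → heatmap steps with different aggregates. A hedged refactoring sketch (the helper name plot_time_heatmap and its signature are ours, not from the original notebook):

WEEKDAYS = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

def plot_time_heatmap(df, title, cmap, value=None, agg='mean'):
    """Aggregate by (weekday, hour), pivot into a 7x24 grid, and draw it."""
    grouped = df.groupby(['weekday', 'hour'])
    series = grouped.size() if value is None else grouped[value].agg(agg)
    grid = (series.reset_index(name='value')
                  .pivot(index='weekday', columns='hour', values='value')
                  .rename(mapper=mapper, axis=0)
                  .reindex(WEEKDAYS))
    plt.figure(figsize=(20, 5))
    plt.title(title)
    sns.heatmap(grid, cmap=cmap, cbar=False)

# e.g. plot_time_heatmap(total_df_train, 'Average Likes within 10 hours by Day of Week / Hour of Day',
#                        'Purples', value='liked_count')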
In [12]:
# Share of trending articles by day of week and hour of day
num_pops_heatmap_df = total_df_train.groupby(['weekday', 'hour'])['is_trending'].sum().reset_index()
num_pops_heatmap_df = num_pops_heatmap_df.pivot(index='weekday', columns='hour', values='is_trending')
num_pops_heatmap_df = num_pops_heatmap_df.rename(mapper=mapper, axis=0)
num_pops_heatmap_df = num_pops_heatmap_df.reindex(['Monday', 'Tuesday', 'Wednesday',
                                                   'Thursday', 'Friday', 'Saturday', 'Sunday'])
pct_pops_heatmap_df = num_pops_heatmap_df/num_articles_heatmap_df
plt.figure(figsize=(20, 5))
plt.title('Percentage of Popular Articles by Day of Week / Hour of Day')
sns.heatmap(pct_pops_heatmap_df, cmap='Blues', cbar=False)
In [13]:
# Average like count within the first 10 hours, by day of week and hour of day
num_likes_10_heatmap_df = total_df_train.groupby(['weekday', 'hour'])['liked_count'].mean().reset_index()
num_likes_10_heatmap_df = num_likes_10_heatmap_df.pivot(index='weekday', columns='hour', values='liked_count')
num_likes_10_heatmap_df = num_likes_10_heatmap_df.rename(mapper=mapper, axis=0)
num_likes_10_heatmap_df = num_likes_10_heatmap_df.reindex(['Monday', 'Tuesday', 'Wednesday',
                                                           'Thursday', 'Friday', 'Saturday', 'Sunday'])
plt.figure(figsize=(20, 5))
plt.title('Average Likes within 10 hours by Day of Week / Hour of Day')
sns.heatmap(num_likes_10_heatmap_df, cmap='Purples', cbar=False)
In [14]:
# Average like count within the first 36 hours, by day of week and hour of day
num_likes_36_heatmap_df = total_df_train.groupby(['weekday', 'hour'])['like_count_36_hour'].mean().reset_index()
num_likes_36_heatmap_df = num_likes_36_heatmap_df.pivot(index='weekday', columns='hour', values='like_count_36_hour')
num_likes_36_heatmap_df = num_likes_36_heatmap_df.rename(mapper=mapper, axis=0)
num_likes_36_heatmap_df = num_likes_36_heatmap_df.reindex(['Monday', 'Tuesday', 'Wednesday',
                                                           'Thursday', 'Friday', 'Saturday', 'Sunday'])
plt.figure(figsize=(20, 5))
plt.title('Average Likes within 36 hours by Day of Week / Hour of Day')
sns.heatmap(num_likes_36_heatmap_df, cmap='YlGn', cbar=False)
Evaluation
In [15]:
# Drop the per-split columns we will not use
cv_results = cv_results.drop(columns=[col for col in cv_results.columns if 'split' in col])

# Simplify verbose cell values down to short labels
def transform(elem, mapper):
    if isinstance(elem, float) and isnan(elem):
        return elem
    for sub_str in mapper:
        if sub_str in elem:
            return mapper[sub_str]
    return elem

# resampler
mapper = {
    'SMOTE': 'SMOTE',
    'NearMiss': 'NearMiss'
}
cv_results['param_resampler'] = cv_results['param_resampler'].apply(transform, args=(mapper,))

# classifier
mapper = {
    'AdaBoostClassifier': 'AdaBoostClassifier',
    'XGBClassifier': 'XGBClassifier',
    'GradientBoostingClassifier': 'GradientBoostingClassifier'
}
cv_results['param_classifier'] = cv_results['param_classifier'].apply(transform, args=(mapper,))

# classifier__base_estimator
mapper = {
    'max_depth=1': 'DecisionTreeClassifier(max_depth=1)',
    'max_depth=2': 'DecisionTreeClassifier(max_depth=2)',
    'max_depth=3': 'DecisionTreeClassifier(max_depth=3)'
}
cv_results['param_classifier__base_estimator'] = cv_results['param_classifier__base_estimator'].apply(transform, args=(mapper,))
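For instance (the repr strings below are hypothetical, shortened for illustration):

transform("SMOTE(k_neighbors=5, random_state=None)", {'SMOTE': 'SMOTE'})  # -> 'SMOTE'
transform('passthrough', {'SMOTE': 'SMOTE'})  # -> 'passthrough' (no key matches, so the value passes through)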
Resampler
In [16]:
temp = cv_results.groupby(['param_resampler'])[['mean_test_precision', 'mean_test_recall', 'mean_test_specificity', 'mean_test_f1_score', 'mean_test_balanced_accuracy']].mean()
temp.reset_index(inplace=True)
(ggplot(temp, aes(x='param_resampler', y='mean_test_f1_score', fill='param_resampler'))
 + geom_bar(stat="identity")
 + ylim(0, 1)
 + scale_fill_brewer('qualitative', 2)
 + geom_text(aes(label='mean_test_f1_score'), format_string='{:.2f}')
 + ggtitle('Average F1 Score by Resampler')
 + labs(fill='Resampler')
 + xlab('Resampler')
 + ylab('Average F1 score'))
In [17]:
(ggplot(temp, aes(x='param_resampler', y='mean_test_recall', fill='param_resampler'))
 + geom_bar(stat="identity")
 + ylim(0, 1)
 + scale_fill_brewer('qualitative', 2)
 + geom_text(aes(label='mean_test_recall'), format_string='{:.2f}')
 + ggtitle('Average Recall by Resampler')
 + labs(fill='Resampler')
 + xlab('Resampler')
 + ylab('Average Recall'))
In [18]:
(ggplot(temp, aes(x='param_resampler', y='mean_test_precision', fill='param_resampler'))
 + geom_bar(stat="identity")
 + ylim(0, 1)
 + scale_fill_brewer('qualitative', 2)
 + geom_text(aes(label='mean_test_precision'), format_string='{:.2f}')
 + ggtitle('Average Precision by Resampler')
 + labs(fill='Resampler')
 + xlab('Resampler')
 + ylab('Average Precision'))
In [19]:
(ggplot(temp, aes(x='param_resampler', y='mean_test_balanced_accuracy', fill='param_resampler'))
 + geom_bar(stat="identity")
 + ylim(0, 1)
 + scale_fill_brewer('qualitative', 2)
 + geom_text(aes(label='mean_test_balanced_accuracy'), format_string='{:.2f}')
 + ggtitle('Average Balanced Accuracy by Resampler')
 + labs(fill='Resampler')
 + xlab('Resampler')
 + ylab('Average Balanced Accuracy'))
Resampler + Classifier
In [20]:
temp = cv_results.groupby(['param_resampler', 'param_classifier'])[['mean_test_precision', 'mean_test_recall', 'mean_test_specificity', 'mean_test_f1_score', 'mean_test_balanced_accuracy']].mean()
temp.reset_index(inplace=True)
(ggplot(temp, aes(x='param_resampler', y='mean_test_f1_score', fill='param_classifier'))
 + geom_bar(position='dodge', stat="identity")
 + ylim(0, 1)
 + geom_text(aes(label='mean_test_f1_score'), position=position_dodge(width=0.9), format_string='{:.2f}')
 + ggtitle('Average F1 Score by Resampler and Classifier')
 + labs(fill='Classifier')
 + xlab('Resampler')
 + ylab('Average F1 score'))
Classifier
In [21]:
temp = cv_results.groupby(['param_classifier'])[['mean_test_precision', 'mean_test_recall', 'mean_test_specificity', 'mean_test_f1_score', 'mean_test_balanced_accuracy']].mean()
temp.reset_index(inplace=True)
(ggplot(temp, aes(x='param_classifier', y='mean_test_f1_score', fill='param_classifier'))
 + geom_bar(stat="identity")
 + ylim(0, 1)
 + geom_text(aes(label='mean_test_f1_score'), format_string='{:.2f}')
 + ggtitle('Average F1 Score by Classifier')
 + labs(fill='Classifier')
 + xlab('Classifier')
 + ylab('Average F1 score'))
Classifier + n_estimators
In [22]:
temp = cv_results.groupby(['param_classifier', 'param_classifier__n_estimators'])[['mean_test_precision', 'mean_test_recall', 'mean_test_specificity', 'mean_test_f1_score', 'mean_test_balanced_accuracy']].mean()
temp.reset_index(inplace=True)
(ggplot(temp, aes(x='param_classifier__n_estimators', y='mean_test_f1_score', color='param_classifier'))
 + geom_line()
 + geom_point()
 + ylim(0, 1)
 + ggtitle('Average F1 Score by Classifier and Number of Estimators')
 + labs(color='Classifier')
 + xlab('Number of Estimators')
 + ylab('Average F1 score'))
AdaBoostClassifier + max_depth
In [23]:
cv_results[cv_results['param_resampler']=='passthrough'].groupby(['param_classifier', 'param_classifier__base_estimator'])[['mean_test_precision', 'mean_test_recall', 'mean_test_specificity', 'mean_test_f1_score', 'mean_test_balanced_accuracy']].mean()
Out[23]:
| param_classifier | param_classifier__base_estimator | mean_test_precision | mean_test_recall | mean_test_specificity | mean_test_f1_score | mean_test_balanced_accuracy |
|---|---|---|---|---|---|---|
| AdaBoostClassifier | DecisionTreeClassifier(max_depth=1) | 0.738579 | 0.436288 | 0.996339 | 0.548524 | 0.716314 |
| AdaBoostClassifier | DecisionTreeClassifier(max_depth=2) | 0.759336 | 0.443006 | 0.996670 | 0.559510 | 0.719838 |
| AdaBoostClassifier | DecisionTreeClassifier(max_depth=3) | 0.755862 | 0.441223 | 0.996619 | 0.557159 | 0.718921 |
GradientBoostingClassifier, XGBClassifier + learning_rate
In [24]:
cv_results[cv_results['param_resampler']=='passthrough'].groupby(['param_classifier', 'param_classifier__learning_rate'])[['mean_test_precision', 'mean_test_recall', 'mean_test_specificity', 'mean_test_f1_score', 'mean_test_balanced_accuracy']].mean()
Out[24]:
| param_classifier | param_classifier__learning_rate | mean_test_precision | mean_test_recall | mean_test_specificity | mean_test_f1_score | mean_test_balanced_accuracy |
|---|---|---|---|---|---|---|
| GradientBoostingClassifier | 0.025 | 0.790585 | 0.395179 | 0.997518 | 0.526909 | 0.696348 |
| GradientBoostingClassifier | 0.050 | 0.780465 | 0.423388 | 0.997177 | 0.548966 | 0.710282 |
| GradientBoostingClassifier | 0.100 | 0.778204 | 0.434859 | 0.997062 | 0.557939 | 0.715961 |
| XGBClassifier | 0.025 | 0.754734 | 0.404196 | 0.996884 | 0.526422 | 0.700540 |
| XGBClassifier | 0.050 | 0.776283 | 0.406060 | 0.997226 | 0.533204 | 0.701643 |
| XGBClassifier | 0.100 | 0.783911 | 0.419787 | 0.997256 | 0.546763 | 0.708522 |
Best Model
f1-score
In [25]:
print(cv_results[cv_results['rank_test_f1_score']==1]['params'].values[0])
{'classifier': AdaBoostClassifier(algorithm='SAMME.R', base_estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini', max_depth=2, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, presort='deprecated', random_state=None, splitter='best'), learning_rate=1.0, n_estimators=100, random_state=None), 'classifier__base_estimator': DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini', max_depth=2, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, presort='deprecated', random_state=None, splitter='best'), 'classifier__n_estimators': 100, 'resampler': 'passthrough'}
In [26]:
temp = cv_results[cv_results['rank_test_f1_score']==1]
temp[['mean_test_precision', 'mean_test_recall', 'mean_test_specificity', 'mean_test_f1_score', 'mean_test_balanced_accuracy']]
Out[26]:
|   | mean_test_precision | mean_test_recall | mean_test_specificity | mean_test_f1_score | mean_test_balanced_accuracy |
|---|---|---|---|---|---|
| 15 | 0.759668 | 0.44419 | 0.996667 | 0.560527 | 0.720429 |
balanced accuracy
In [27]:
print(cv_results[cv_results['rank_test_balanced_accuracy']==1]['params'].values[0])
{'classifier': GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None, learning_rate=0.1, loss='deviance', max_depth=3, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_iter_no_change=None, presort='deprecated', random_state=None, subsample=1.0, tol=0.0001, validation_fraction=0.1, verbose=0, warm_start=False), 'classifier__learning_rate': 0.025, 'classifier__n_estimators': 120, 'resampler': SMOTE(k_neighbors=5, n_jobs=None, random_state=None, sampling_strategy='auto')}
In [28]:
temp = cv_results[cv_results['rank_test_balanced_accuracy']==1]
temp[['mean_test_precision', 'mean_test_recall', 'mean_test_specificity', 'mean_test_f1_score', 'mean_test_balanced_accuracy']]
Out[28]:
|   | mean_test_precision | mean_test_recall | mean_test_specificity | mean_test_f1_score | mean_test_balanced_accuracy |
|---|---|---|---|---|---|
| 46 | 0.199325 | 0.958312 | 0.908746 | 0.330003 | 0.933529 |