やみとものプログラミング日記 やみとものプログラミング日記
TOP 【Kaggleライブラリ】House Pricesを線形モデルで解いてみた(FeatureManagerクラス)
【Kaggleライブラリ】House Pricesを線形モデルで解いてみた(FeatureManagerクラス)

【Kaggleライブラリ】House Pricesを線形モデルで解いてみた(FeatureManagerクラス)

Kaggle
作成日時: 2020年2月5日(水) 15時51分
更新日時: 2020年2月5日(水) 15時55分

FeatureManagerクラスだいぶ整備した。なんとなくコードが分かりやすい???

class FeatureManager:
    """Class-level registry that tracks dataset columns for a Kaggle-style
    train/test workflow.

    Each entry of ``features`` is a dict with keys:
      name -- column name
      use  -- whether the column is fed to the model
      type -- pandas dtype of the column in the training frame
      ope  -- "->"-separated chain of queued preprocessing operations
      info -- missing-value summary, e.g. "2row null(3)"
      memo -- free-form note ("TARGET" for the target column)

    All state lives on the class itself, so the manager behaves as a
    process-wide singleton accessed through classmethods.
    """

    # Shared class-level state (reset by init_features).
    features = []
    target_name = None
    df_train = None
    df_test = None

    @classmethod
    def init_features(cls, df_train, df_test, target_name=None, def_use=False, print_arr=True):
        """Register every column of df_train and compute missing-value info.

        Parameters
        ----------
        df_train, df_test : pandas.DataFrame
            Training frame (contains the target) and test frame.
        target_name : str or None
            Name of the target column in df_train.
        def_use : bool
            Initial "use" flag for every feature.
        print_arr : bool
            Print the feature table after initialisation.
        """
        cls.target_name = target_name
        cls.df_train = df_train
        cls.df_test = df_test

        cls.features = [
            {
                "name": col,
                "use": def_use,
                "type": df_train[col].dtype,
                "ope": "",
                "memo": "TARGET" if col == target_name else "",
            }
            for col in df_train.columns
        ]

        # Missing-value info, counted over train+test combined.
        # Guard: drop(None) would raise, so only drop when a target is set.
        base_train = df_train.drop(target_name, axis=1) if target_name is not None else df_train
        df_train_test = pd.concat([base_train, df_test])
        null_sum = df_train_test.isnull().sum()
        for f in cls.features:
            if f["name"] == target_name:
                f["info"] = ""
            elif null_sum[f["name"]] != 0:
                n_null = null_sum[f["name"]]
                nunique = df_train_test[f["name"]].nunique()
                f["info"] = f"{n_null}row null({nunique})"
            else:
                f["info"] = ""

        if print_arr:
            cls.print_feature_array()

    @classmethod
    def print_feature_array(cls, feature_arr=None):
        """Pretty-print a feature table as copy-pasteable Python source.

        Defaults to the full registry when *feature_arr* is None.
        """
        if feature_arr is None:
            feature_arr = cls.features

        print("features = [")
        for f in feature_arr:
            col_quote = f"\"{f['name']}\","
            use_str = str(f["use"])
            type_quote = f"\"{f['type']}\","
            if f["info"] != "":
                # Pad so the columns line up in the printed table.
                info_quote = f"\"{f['info']}\"," + (" " * (18 - len(f["info"])))
            else:
                info_quote = '"",' + (" " * 18)
            print(f"    {{ \"name\": {col_quote: <17} \"use\": {use_str: <6},   \"type\": {type_quote: <11} \"info\": {info_quote} \"ope\": \"{f['ope']}\", \"memo\": \"{f['memo']}\" }},")
        print("]")

    @classmethod
    def set_use(cls, feature_name, print_arr=False):
        """Mark one feature (str) or several (list of str) as used."""
        feature_names = feature_name if isinstance(feature_name, list) else [feature_name]
        for f in cls.features:
            if f["name"] in feature_names:
                f["use"] = True
        if print_arr:
            cls.print_feature_array()

    @classmethod
    def set_unuse(cls, feature_name, print_arr=False):
        """Unmark one feature (str) or several (list of str).

        Now accepts a list, mirroring set_use (the original only took a str).
        """
        feature_names = feature_name if isinstance(feature_name, list) else [feature_name]
        for f in cls.features:
            if f["name"] in feature_names:
                f["use"] = False
        if print_arr:
            cls.print_feature_array()

    @classmethod
    def set_memo(cls, feature_name=None, memo="", print_arr=False):
        """Attach a free-form memo to a feature.

        Implements the original "implement me!" placeholder. Calling it with
        no arguments remains a no-op, so existing callers are unaffected.
        """
        if feature_name is None:
            return
        for f in cls.features:
            if f["name"] == feature_name:
                f["memo"] = memo
        if print_arr:
            cls.print_feature_array()

    @classmethod
    def get_use_features(cls):
        """Names of features marked "use", excluding Id and the target."""
        return [f["name"] for f in cls.features
                if f["use"] and f["name"] not in ["Id", cls.target_name]]

    @classmethod
    def print_use_features(cls):
        """Print the feature table restricted to the currently used features."""
        used = [f for f in cls.features
                if f["use"] and f["name"] not in ["Id", cls.target_name]]
        cls.print_feature_array(used)

    @classmethod
    def get_use_cat_features(cls):
        """Used features whose dtype is object (categorical), in use order."""
        by_name = {f["name"]: f for f in cls.features}
        return [name for name in cls.get_use_features()
                if by_name[name]["type"] == "object"]

    @classmethod
    def get_numeric_features(cls):
        """All numeric (int64/float64) feature names except Id and the target."""
        return [f["name"] for f in cls.features
                if f["name"] not in ["Id", cls.target_name]
                and f["type"] in ["int64", "float64"]]

    @classmethod
    def get_target_feature(cls):
        """Name of the target column (None if not set)."""
        return cls.target_name

    @classmethod
    def _append_ope(cls, feature_name, ope):
        """Append *ope* to the feature's "->"-chained operation string.

        The original setters overwrote "ope" even though do_operation()
        splits on "->"; appending makes the intended chaining actually work.
        """
        for f in cls.features:
            if f["name"] == feature_name:
                f["ope"] = f"{f['ope']}->{ope}" if f["ope"] else ope

    @classmethod
    def fillna_by_mean(cls, feature_name):
        """Queue mean-imputation (mean computed over train+test) for a column."""
        cls._append_ope(feature_name, "fillna_by_mean")

    @classmethod
    def fillna_by_mode(cls, feature_name):
        """Queue mode-imputation (mode computed over train+test) for a column."""
        cls._append_ope(feature_name, "fillna_by_mode")

    @classmethod
    def do_operation(cls):
        """Apply the queued operations and return (train_x, train_y, test_x).

        Bug fix: the original referenced the global ``df_train`` instead of
        ``cls.df_train``, which raises NameError outside the notebook scope.
        """
        use_features = cls.get_use_features()

        base_train = (cls.df_train.drop(cls.target_name, axis=1)
                      if cls.target_name is not None else cls.df_train)
        df_train_test = pd.concat([base_train, cls.df_test])
        df_train_test = df_train_test[use_features]

        for f in cls.features:
            if f["name"] not in use_features:
                continue
            for ope in f["ope"].split("->"):
                # Assign the filled column back instead of inplace-fillna on a
                # sliced frame, avoiding pandas' SettingWithCopyWarning.
                if ope == "fillna_by_mean":
                    df_train_test[f["name"]] = df_train_test[f["name"]].fillna(
                        df_train_test[f["name"]].mean())
                elif ope == "fillna_by_mode":
                    df_train_test[f["name"]] = df_train_test[f["name"]].fillna(
                        df_train_test[f["name"]].mode()[0])

        # Positional split back into the original train/test portions.
        n_train = len(cls.df_train)
        train_x = df_train_test.iloc[:n_train]
        train_y = cls.df_train[cls.target_name]
        test_x = df_train_test.iloc[n_train:]

        return train_x, train_y, test_x


# NOTE(review): this is a Jupyter-notebook export — "%matplotlib inline" below
# is an IPython magic and is not valid in a plain .py file.
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
%matplotlib inline
# Widen pandas display limits so the full feature table prints untruncated.
pd.set_option("display.max_columns", 300)
pd.set_option("display.max_rows", 3000)

# Kaggle House Prices data; paths are relative to the notebook directory.
df_train = pd.read_csv("input/train.csv")
df_test = pd.read_csv("input/test.csv")

# Register every training column; nothing is marked for use yet (def_use=False).
FeatureManager.init_features(df_train, df_test, target_name="SalePrice", def_use=False)


# Peek at the 10 features most correlated with the target (plus the target
# itself at rank 1) to decide which columns to use below.
df_train.corr()["SalePrice"].sort_values(ascending=False).head(11)


# Mark the strongly target-correlated numeric features for use in the model.
FeatureManager.set_use([
    "OverallQual",
    "GrLivArea",
    "GarageCars",
    "GarageArea",
    "TotalBsmtSF",
    "1stFlrSF"
])
FeatureManager.print_use_features()
# Queue missing-value imputation for the selected columns that contain nulls.
FeatureManager.fillna_by_mean("TotalBsmtSF")
FeatureManager.fillna_by_mode("GarageCars")
FeatureManager.fillna_by_mean("GarageArea")
FeatureManager.print_use_features()


# Apply the queued operations and split back into train/test matrices.
train_x, train_y, test_x = FeatureManager.do_operation()


import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge

# 4-fold cross-validation with a fixed seed so fold splits are reproducible.
kf = KFold(n_splits=4, shuffle=True, random_state=894)

rmses = []      # per-fold RMSE on the raw target scale
rmse_logs = []  # per-fold RMSE on log(target) — the competition's metric
ans = np.zeros(len(test_x))  # sum of per-fold test predictions; averaged below

for tr_idx, va_idx in kf.split(train_x):
    tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
    tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
    
#     res = xgb_reg(tr_x, va_x, tr_y, va_y, test_x)
    # Plain Ridge regression with default hyperparameters on this fold.
    ridge = Ridge()
    ridge.fit(tr_x, tr_y)
    
    va_pred = ridge.predict(va_x)
    # Replace negative predictions with 1 so np.log below is defined.
    va_pred = np.where(va_pred < 0, 1, va_pred)
    rmse = np.sqrt(mean_squared_error(va_y, va_pred))
    rmse_log = np.sqrt(mean_squared_error(np.log(va_y), np.log(va_pred)))
    
    # This fold's model's predictions for the test data.
    pred = ridge.predict(test_x)
    pred = np.where(pred < 0, 1, pred)
    
    rmses.append(rmse)
    rmse_logs.append(rmse_log)
    
    ans += pred

# Average the accumulated test predictions over the number of folds.
ans /= len(rmses)


CV = np.mean(rmse_logs)
print(f"CV = {CV}")

# NOTE(review): the line below looks like notebook output pasted back into the
# code; as written it overwrites the CV value just computed above — confirm.
CV = 0.5191353106756289

# Build the submission file: Id plus the averaged SalePrice predictions.
data = {
    "Id": df_test["Id"],
    "SalePrice": ans
}
df_ans = pd.DataFrame(data)
df_ans.head()
df_ans.to_csv("ridge_cv_0_519.csv", index=False)



コメント(0)

まだコメントがありません。
もしよろしければ下のフォームからコメント下さい。


コメントする

もしよろしければコメント下さい。

ハンドルネーム:

内容:

最新記事


【英語】テスト駆動勉強法
【英語】テスト駆動勉強法
コサイン類似度はベクトルを正規化してから内積を取っている
コサイン類似度はベクトルを正規化してから内積を取っている
【ゼロから作るDeep Learning 2】MatMulノード解説
【ゼロから作るDeep Learning 2】MatMulノード解説
『Kaggle』カテゴリの記事