やみとものプログラミング日記
TOP 【Kaggleライブラリ】House Prices スコア0.16240 Ridgeモデル
【Kaggleライブラリ】House Prices スコア0.16240 Ridgeモデル

【Kaggleライブラリ】House Prices スコア0.16240 Ridgeモデル

Kaggle
作成日時: 2020年2月5日(水) 17時14分
更新日時: 2020年2月5日(水) 17時17分

相関の高い10変数を標準化し、目的変数にはlogを取って計算。

CV = 0.17352218817383172

Publicスコア0.16240

class FeatureManager:
    """Track per-column feature metadata for a train/test DataFrame pair
    and apply queued preprocessing operations.

    All state is class-level: call ``init_features`` once, then mark
    features with ``set_use``/``set_unuse``, queue operations
    (``fillna_by_mean``, ``fillna_by_mode``, ``standardize``, ``log``),
    and finally call ``do_operation`` to materialize train_x/train_y/test_x.
    """

    # Class-level state, (re)populated by init_features().
    features = []        # list of dicts: name / use / type / ope / memo / info
    target_name = None   # name of the target column in df_train
    df_train = None      # training DataFrame (includes the target column)
    df_test = None       # test DataFrame (no target column)

    @classmethod
    def init_features(cls, df_train, df_test, target_name=None, def_use=False, print_arr=True):
        """Register every column of df_train as a feature entry.

        def_use is the initial "use" flag for every feature; print_arr
        controls whether the feature table is printed afterwards.
        """
        cls.target_name = target_name
        cls.df_train = df_train
        cls.df_test = df_test

        cls.features = [
            {
                "name": col,
                "use": def_use,
                "type": df_train[col].dtype,
                "ope": "",
                "memo": "TARGET" if col == target_name else "",
            }
            for col in df_train.columns
        ]

        # Missing-value info, computed over train+test combined
        # (target column dropped so the two frames align).
        df_train_test = pd.concat([df_train.drop(target_name, axis=1), df_test])
        null_sum = df_train_test.isnull().sum()
        for f in cls.features:
            if f["name"] == target_name:
                f["info"] = ""
                continue
            if null_sum[f["name"]] != 0:
                null_sum_ = null_sum[f['name']]
                nunique = df_train_test[f['name']].nunique()
                f["info"] = f"{null_sum_}row null({nunique})"
            else:
                f["info"] = ""

        if print_arr:
            cls.print_feature_array()

    @classmethod
    def print_feature_array(cls, feature_arr=None):
        """Pretty-print feature dicts as a copy-pastable Python list.

        Defaults to the full registered feature list.
        """
        if feature_arr is None:
            feature_arr = cls.features

        print("features = [")
        for f in feature_arr:
            col_quote = f"\"{f['name']}\","
            use_str = str(f["use"])
            type_quote = f"\"{f['type']}\","
            if f["info"] != "":
                info_quote = f"\"{f['info']}\"," + (" " * (18 - len(f["info"])))
            else:
                info_quote = '"",' + (" " * 18)
            print(f"    {{ \"name\": {col_quote: <17} \"use\": {use_str: <6},   \"type\": {type_quote: <11} \"info\": {info_quote} \"ope\": \"{f['ope']}\", \"memo\": \"{f['memo']}\" }},")
        print("]")

    @classmethod
    def set_use(cls, feature_name, print_arr=False):
        """Mark one feature name (or a list of names) as used."""
        feature_names = feature_name if isinstance(feature_name, list) else [feature_name]
        for f_name in feature_names:
            for f in cls.features:
                if f["name"] == f_name:
                    f["use"] = True
        if print_arr:
            cls.print_feature_array()

    @classmethod
    def set_unuse(cls, feature_name, print_arr=False):
        """Mark one feature name (or a list of names) as unused.

        Now accepts a list for consistency with set_use; a single string
        still works as before.
        """
        feature_names = feature_name if isinstance(feature_name, list) else [feature_name]
        for f_name in feature_names:
            for f in cls.features:
                if f["name"] == f_name:
                    f["use"] = False
        if print_arr:
            cls.print_feature_array()

    @classmethod
    def set_memo(cls):
        # TODO: not implemented yet.
        pass

    @classmethod
    def get_use_features(cls):
        """Names of used features, excluding "Id" and the target."""
        return [
            f["name"]
            for f in cls.features
            if f["use"] and f["name"] not in ["Id", cls.target_name]
        ]

    @classmethod
    def print_use_features(cls):
        """Print the feature table restricted to used features."""
        use_arr = [
            f
            for f in cls.features
            if f["use"] and f["name"] not in ["Id", cls.target_name]
        ]
        cls.print_feature_array(use_arr)

    @classmethod
    def print_target(cls):
        """Print the feature table restricted to the target column."""
        target = [f for f in cls.features if f["name"] == cls.target_name]
        cls.print_feature_array(target)

    @classmethod
    def get_use_cat_features(cls):
        """Used features whose dtype is object (categorical)."""
        ret = []
        for name in cls.get_use_features():
            for f in cls.features:
                if f["name"] == name and f["type"] == "object":
                    ret.append(name)
        return ret

    @classmethod
    def get_numeric_features(cls):
        """All int64/float64 feature names, excluding "Id" and the target
        (independent of the "use" flag)."""
        return [
            f["name"]
            for f in cls.features
            if f["name"] != cls.target_name and f["name"] != "Id"
            and f["type"] in ["int64", "float64"]
        ]

    @classmethod
    def get_target_feature(cls):
        """Return the registered target column name."""
        return cls.target_name

    @classmethod
    def fillna_by_mean(cls, feature_name):
        """Queue a mean-imputation step for feature_name."""
        for f in cls.features:
            if f["name"] == feature_name:
                # BUG FIX: was `=`, which silently discarded any
                # previously queued operations for this feature.
                f["ope"] += "fillna_by_mean->"

    @classmethod
    def fillna_by_mode(cls, feature_name):
        """Queue a mode-imputation step for feature_name."""
        for f in cls.features:
            if f["name"] == feature_name:
                # BUG FIX: append instead of overwrite (see fillna_by_mean).
                f["ope"] += "fillna_by_mode->"

    @classmethod
    def standardize(cls, feature_name):
        """Queue a standardization (zero mean, unit variance) step."""
        for f in cls.features:
            if f["name"] == feature_name:
                f["ope"] += "standardize->"

    @classmethod
    def log(cls, feature_name):
        """Queue a natural-log transform step."""
        for f in cls.features:
            if f["name"] == feature_name:
                f["ope"] += "log->"

    @classmethod
    def do_operation(cls):
        """Apply each feature's queued "ope" chain and split back into
        train and test parts.

        Returns (train_x, train_y, test_x). train_y is df_train[target]
        after any queued target operations (note: "standardize" turns it
        into a numpy array, while "log" keeps it a Series).
        """
        import numpy as np

        use_features = cls.get_use_features()

        # Transform train+test together so fillna/standardize statistics
        # are computed over the combined data.
        # BUG FIX: was the module global `df_train`, not `cls.df_train`.
        df_train_test = pd.concat([cls.df_train.drop(cls.target_name, axis=1), cls.df_test])
        df_train_test = df_train_test[use_features]

        # Apply the queued operations to each used feature, in order.
        for f in cls.features:
            if f["name"] in use_features:
                for ope in f["ope"].strip("->").split("->"):
                    if ope == "fillna_by_mean":
                        # Assignment instead of chained inplace fillna:
                        # the chained form is unreliable under pandas
                        # copy-on-write.
                        df_train_test[f["name"]] = df_train_test[f["name"]].fillna(df_train_test[f["name"]].mean())
                    elif ope == "fillna_by_mode":
                        df_train_test[f["name"]] = df_train_test[f["name"]].fillna(df_train_test[f["name"]].mode()[0])
                    elif ope == "standardize":
                        # Imported lazily so sklearn is only required when
                        # a standardize operation was actually queued.
                        from sklearn.preprocessing import StandardScaler
                        scaler = StandardScaler()
                        df_train_test[f["name"]] = scaler.fit_transform(df_train_test[f["name"]].astype("float64").values.reshape(-1, 1))
                    elif ope == "log":
                        df_train_test[f["name"]] = np.log(df_train_test[f["name"]])

        # Apply the queued operations to the target column.
        train_y = cls.df_train[cls.target_name]
        for f in cls.features:
            if f["name"] == cls.target_name:
                for ope in f["ope"].strip("->").split("->"):
                    if ope in ("fillna_by_mean", "fillna_by_mode"):
                        # fillna is deliberately a no-op for the target.
                        pass
                    elif ope == "standardize":
                        from sklearn.preprocessing import StandardScaler
                        scaler = StandardScaler()
                        train_y = scaler.fit_transform(train_y.astype("float64").values.reshape(-1, 1))
                    elif ope == "log":
                        train_y = np.log(train_y)

        # Positional split: the first len(df_train) rows are train.
        train_x = df_train_test[ : len(cls.df_train)]
        test_x = df_train_test[len(cls.df_train) : ]

        return train_x, train_y, test_x


# Third-party imports and notebook-wide settings.
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
# IPython magic: render matplotlib figures inline (notebook only).
%matplotlib inline
# Show wide/long DataFrames without truncation when inspecting them.
pd.set_option("display.max_columns", 300)
pd.set_option("display.max_rows", 3000)

# Load the Kaggle House Prices data.
df_train = pd.read_csv("input/train.csv")
df_test = pd.read_csv("input/test.csv")

# Register every column; nothing is marked as used yet.
FeatureManager.init_features(df_train, df_test, target_name="SalePrice", def_use=False)


# Inspect the 10 features most correlated with the target (plus the target itself).
df_train.corr()["SalePrice"].sort_values(ascending=False).head(11)


# The 10 numeric features with the highest correlation to SalePrice.
selected_features = [
    "OverallQual",
    "GrLivArea",
    "GarageCars",
    "GarageArea",
    "TotalBsmtSF",
    "1stFlrSF",
    "FullBath",
    "TotRmsAbvGrd",
    "YearBuilt",
    "YearRemodAdd",
]
FeatureManager.set_use(selected_features)

# Train on log(SalePrice); predictions are exponentiated back later.
FeatureManager.log("SalePrice")

# Impute the few missing values in the combined train+test data.
FeatureManager.fillna_by_mean("TotalBsmtSF")
FeatureManager.fillna_by_mode("GarageCars")
FeatureManager.fillna_by_mean("GarageArea")

# Standardize every selected feature (zero mean, unit variance).
for feature in selected_features:
    FeatureManager.standardize(feature)

FeatureManager.print_use_features()
FeatureManager.print_target()


train_x, train_y, test_x = FeatureManager.do_operation()


import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge


# 4-fold CV with a fixed seed so the CV score is reproducible.
kf = KFold(n_splits=4, shuffle=True, random_state=894)

rmses = []
rmse_logs = []
ans = np.zeros(len(test_x))

for train_idx, valid_idx in kf.split(train_x):
    fold_train_x, fold_valid_x = train_x.iloc[train_idx], train_x.iloc[valid_idx]
    fold_train_y, fold_valid_y = train_y.iloc[train_idx], train_y.iloc[valid_idx]

    model = Ridge()
    model.fit(fold_train_x, fold_train_y)

    # Validation predictions; negative values are nonsense on the log
    # scale, so clamp them to log(mean SalePrice) before scoring.
    valid_pred = model.predict(fold_valid_x)
    valid_pred = np.where(valid_pred < 0, np.log(df_train["SalePrice"].mean()), valid_pred)
    # Score both on the original price scale and on the log scale.
    rmses.append(np.sqrt(mean_squared_error(np.exp(fold_valid_y), np.exp(valid_pred))))
    rmse_logs.append(np.sqrt(mean_squared_error(fold_valid_y, valid_pred)))

    # This fold's test-set prediction, clamped and mapped back to prices,
    # accumulated for averaging across folds.
    fold_pred = model.predict(test_x)
    fold_pred = np.where(fold_pred < 0, np.log(df_train["SalePrice"].mean()), fold_pred)
    ans += np.exp(fold_pred)

# Average the fold predictions and report the mean log-scale RMSE.
ans /= len(rmses)
CV = np.mean(rmse_logs)
print(f"CV = {CV}")


# Build the submission: one averaged SalePrice prediction per test Id.
df_ans = pd.DataFrame({
    "Id": df_test["Id"],
    "SalePrice": ans,
})
df_ans.head()
df_ans.to_csv("ridge_cv_0_173.csv", index=False)



コメント(0)

まだコメントがありません。
もしよろしければ下のフォームからコメント下さい。


コメントする

もしよろしければコメント下さい。

ハンドルネーム:

内容:

最新記事


【英語】テスト駆動勉強法
【英語】テスト駆動勉強法
コサイン類似度はベクトルを正規化してから内積を取っている
コサイン類似度はベクトルを正規化してから内積を取っている
【ゼロから作るDeep Learning 2】MatMulノード解説
【ゼロから作るDeep Learning 2】MatMulノード解説
『Kaggle』カテゴリの記事