やみとものプログラミング日記
TOP 【Kaggleライブラリ】FeatureManager(最新コード)
【Kaggleライブラリ】FeatureManager(最新コード)

【Kaggleライブラリ】FeatureManager(最新コード)

Kaggle
作成日時: 2020年2月5日(水) 18時55分
更新日時: 2020年4月26日(日) 20時15分

FeatureManager(最新コード)

class FeatureManager:
    features = []
    target_name = None
    df_train = None
    df_test = None
    
    def __init__(self, df_train, df_test, target_name, def_use=False, print_arr=True):
        self.features = []
        self.target_name = target_name
        self.df_train = df_train
        self.df_test = df_test
        
        for col in df_train.columns:
            self.features.append({
                "name": col,
                "use":  False if col in [target_name, "Id"] else def_use,
                "type": df_train[col].dtype,
                "ope": "",
                "memo": ""
            })
            
        # info設定
        # 欠損値情報
        df_train_test = pd.concat([df_train.drop(target_name, axis=1), df_test])
        null_sum = df_train_test.isnull().sum()
        for f in self.features:
            if f["name"] == target_name:
                f["info"] = "TARGET"
                continue
                
            if null_sum[f["name"]] != 0:
                null_sum_ = null_sum[f['name']]
                nunique = df_train_test[f['name']].nunique()
                f["info"] = f"{null_sum_}row null"
            else:
                f["info"] = ""
                
        # 水準数情報追加
        for f in self.features:
            if f["name"] == target_name:
                nunique = df_train[target_name].nunique()
            else:
                nunique = df_train_test[f["name"]].nunique()
            f["info"] += f"({nunique} uni)"
            
        if print_arr:
            self.print_feature_array()
    
    def print_feature_array(self, feature_arr=None):
        if feature_arr is None:
            feature_arr = self.features
            
        print("features = [")
        for f in feature_arr:
            col_quote = f"\"{f['name']}\","
            use_str = str(f["use"])
            type_quote = f"\"{f['type']}\","
            if f["info"] != "":
#                 info_quote = f"\"\033[31m{f['info']}\033[0m\"," + (" " * (18 - len(f["info"])))
                info_quote = f"\"{f['info']}\"," + (" " * (18 - len(f["info"])))
            else:
                info_quote = '"",' + (" " * 18)
            print(f"    {{ \"name\": {col_quote: <17} \"use\": {use_str: <6},   \"type\": {type_quote: <11} \"info\": {info_quote} \"ope\": \"{f['ope']}\", \"memo\": \"{f['memo']}\" }},")
        print("]")
        
    def set_use(self, feature_name, print_arr=False):
        feature_names = []
        if not isinstance(feature_name, list):
            feature_names.append(feature_name)
        else:
            feature_names = feature_name
        for f_name in feature_names:
            for f in self.features:
                if f["name"] == f_name and f["name"] != self.target_name and f["name"] != "Id":
                    f["use"] = True
        if print_arr:
            self.print_feature_array()
        
    def set_unuse(self, feature_name, print_arr=False):
        for f in self.features:
            if f["name"] == feature_name:
                f["use"] = False
        if print_arr:
            self.print_feature_array()
            
    def set_memo(self):
        # ★実装しろ!!!
        pass
        
    def get_use_features(self):
        ret = []
        for f in self.features:
            if f["use"] and f["name"] not in ["Id", self.target_name]:
                ret.append(f["name"])
        return ret
    
    def print_use_features(self):
        ret = []
        for f in self.features:
            if f["use"] and f["name"] not in ["Id", self.target_name]:
                ret.append(f)
        self.print_feature_array(ret)
        
    def print_has_null_features(self):
        df_train_test = pd.concat([df_train.drop(self.target_name, axis=1), df_test])
        null_sum = df_train_test.isnull().sum()
        
        features = []
        for f in self.features:
            if f["name"] != self.target_name and null_sum[f["name"]] != 0:
                features.append(f)
        self.print_feature_array(features)
        
    def print_target(self):
        target = []
        for f in self.features:
            if f["name"] == self.target_name:
                target.append(f)
        self.print_feature_array(target)
    
    def get_use_cat_features(self):
        ret = []
        use_features = self.get_use_features()
        for f in use_features:
            for ff in self.features:
                if ff["name"] == f and ff["type"] == "object":
                    ret.append(f)
        return ret
        
    def get_numeric_features(self):
        ret = []
        for f in self.features:
            if f["name"] != self.target_name and f["name"] != "Id" and f["type"] in ["int64", "float64"]:
                ret.append(f["name"])
        return ret
    
    def get_cat_features(self):
        ret = []
        for f in self.features:
            if f["name"] != self.target_name and f["type"] == "object":
                ret.append(f["name"])
        return ret
    
    def get_target_feature(self):
        return self.target_name
                     
    def fillna_by_mean(self, feature_name):
        for f in self.features:                                                 
            if f["name"] == feature_name:
                f["ope"] = "fillna_by_mean->"
                    
    def fillna_by_mode(self, feature_name):
        for f in self.features:                                                 
            if f["name"] == feature_name:
                f["ope"] = "fillna_by_mode->"  
                
    def fillna_by_str(self, feature_name, str_):
        for f in self.features:
            if f["name"] == feature_name:
                f["ope"] = f"fillna_by_str({str_})->"
                
    def fillna_by_num(self, feature_name, num):
        for f in self.features:
            if f["name"] == feature_name:
                f["ope"] = f"fillna_by_num({num})->"
    
    def standardize(self, feature_name):
        for f in self.features:
            if f["name"] == feature_name:
                f["ope"] += "standardize->"
                
    def log(self, feature_name):
        for f in self.features:
            if f["name"] == feature_name:
                f["ope"] += "log->"
                
    def dummies(self, feature_names):
        for f in self.features:
            if f["name"] in feature_names:
                f["ope"] += "dummy->"
     
    def do_operation(self):
        from sklearn.preprocessing import StandardScaler
        import numpy as np
        import re
        
        use_features = self.get_use_features()
        
        df_train_test = pd.concat([df_train.drop(self.target_name, axis=1), self.df_test])
        df_train_test = df_train_test[use_features]
        
        # 操作を行う(特徴量に対して)
        for f in self.features:
            if f["name"] in use_features:
                opes = f["ope"].strip("->")
                opes = opes.split("->")
                for ope in opes:
                    result_str = re.match(r'^fillna_by_str\((.+)\)$', ope)
                    result_num = re.match(r'^fillna_by_num\((.+)\)$', ope)
                    
                    if ope == "fillna_by_mean":
                        df_train_test[f["name"]].fillna(df_train_test[f["name"]].mean(), inplace=True)
                    elif ope == "fillna_by_mode":
                        df_train_test[f["name"]].fillna(df_train_test[f["name"]].mode()[0], inplace=True)
                    elif result_str:
                        df_train_test[f["name"]].fillna(result_str.group(1), inplace=True)
                    elif result_num:
                        df_train_test[f["name"]].fillna(float(result_num.group(1)), inplace=True)
                    elif ope == "standardize":
                        scaler = StandardScaler()
                        df_train_test[f["name"]] = scaler.fit_transform(df_train_test[f["name"]].astype("float64").values.reshape(-1, 1))
                    elif ope == "log":
                        df_train_test[f["name"]] = np.log(df_train_test[f["name"]])
                    elif ope == "dummy":
                        df_train_test = pd.get_dummies(df_train_test, columns=[f["name"]])
        
        # 操作を行う(目的変数に対して)
        train_y = self.df_train[self.target_name]
        for f in self.features:
            if f["name"] == self.target_name:
                opes = f["ope"].strip("->")
                opes = opes.split("->")
                for ope in opes:
                    if ope == "standardize":
                        scaler = StandardScaler()
                        train_y = scaler.fit_transform(train_y.astype("float64").values.reshape(-1, 1))
                    elif ope == "log":
                        train_y = np.log(train_y)
        
        train_x = df_train_test[ : len(self.df_train)]
        test_x = df_train_test[len(self.df_train) : ]
                
        return train_x, train_y, test_x 


コメント(0)

まだコメントがありません。
もしよろしければ下のフォームからコメント下さい。


コメントする

もしよろしければコメント下さい。

ハンドルネーム:

内容:

最新記事


【英語】テスト駆動勉強法
【英語】テスト駆動勉強法
コサイン類似度はベクトルを正規化してから内積を取っている
コサイン類似度はベクトルを正規化してから内積を取っている
【ゼロから作るDeep Learning 2】MatMulノード解説
【ゼロから作るDeep Learning 2】MatMulノード解説
『Kaggle』カテゴリの記事