# -*- coding: utf-8 -*-
"""
This file contains the FeatureSelector class. To run this script, uncomment or add the following lines
in the [options.entry_points] section of setup.cfg:
    console_scripts =
        feature_selector = feature_selector.__main__:main
Then run `python setup.py install`, which will install the command `feature_selector`
inside your current environment.
"""
import argparse
import sys
import logging
import numpy as np
from sklearn.feature_selection import SelectPercentile
from preprocessor.preprocessor import Preprocessor
from itertools import zip_longest
from joblib import dump, load
# Package metadata (author, copyright holder, license identifier).
__author__ = "Harvey Bastidas"
__copyright__ = "Harvey Bastidas"
__license__ = "mit"
# Module-level logger named after this module; used for the debug traces below.
_logger = logging.getLogger(__name__)
def score_func_regression(X, Y):
    """Score features against a continuous target using mutual information.

    Used as the ``score_func`` of ``SelectPercentile`` in
    ``FeatureSelector.feature_selection()`` when running in regression mode.

    Args:
        X: feature matrix, forwarded unchanged to scikit-learn.
        Y: target values, forwarded unchanged to scikit-learn.

    Returns:
        Whatever ``sklearn.feature_selection.mutual_info_regression`` returns
        for ``(X, Y)`` (the per-feature scores).

    NOTE(review): the scikit-learn estimator accepts a ``random_state``; it is
    left at its default here, so scores may differ slightly between runs --
    confirm whether reproducibility matters for callers.
    """
    # Import the function explicitly: a bare ``import sklearn`` does not
    # guarantee that the ``feature_selection`` submodule attribute is loaded.
    from sklearn.feature_selection import mutual_info_regression

    return mutual_info_regression(X, Y)
def score_func_classification(X, Y):
    """Score features against a discrete target using mutual information.

    Used as the ``score_func`` of ``SelectPercentile`` in
    ``FeatureSelector.feature_selection()`` when running in classification
    mode.

    Args:
        X: feature matrix, forwarded unchanged to scikit-learn.
        Y: class labels, forwarded unchanged to scikit-learn.

    Returns:
        Whatever ``sklearn.feature_selection.mutual_info_classif`` returns for
        ``(X, Y)`` (the per-feature scores).

    NOTE(review): the scikit-learn estimator accepts a ``random_state``; it is
    left at its default here, so scores may differ slightly between runs --
    confirm whether reproducibility matters for callers.
    """
    # Import the function explicitly: a bare ``import sklearn`` does not
    # guarantee that the ``feature_selection`` submodule attribute is loaded.
    from sklearn.feature_selection import mutual_info_classif

    return mutual_info_classif(X, Y)
class FeatureSelector(Preprocessor):
    """Preprocessor that keeps only the best-scoring feature columns.

    The columns of ``input_ds`` are scored against a training signal with a
    mutual-information score function and the top ``percentile`` percent are
    kept.  The resulting column mask can be saved to a configuration file and
    re-applied later without re-fitting.

    NOTE(review): ``input_ds``, ``output_file``, ``input_config_file`` and
    ``output_config_file`` are read but never assigned in this class; they are
    presumably populated by ``Preprocessor`` (e.g. in ``assign_arguments``) --
    confirm against the base class.
    """

    def __init__(self, conf):
        """Constructor using the same parameters as the base class."""
        super().__init__(conf)

    def parse_args(self, args):
        """Parse command line parameters and store them on the instance.

        Sets ``no_config``, ``percentile``, ``classification`` and
        ``training_file`` from the parsed namespace.

        Args:
            args ([str]): command line parameters as list of strings
        """
        _logger.debug("Parsing command-line arguments.")
        parser = argparse.ArgumentParser(
            description="Dataset FeatureSelector: select the best scoring features according to a mutual information scoring algorithm."
        )
        parser.add_argument("--training_file", help="filename of the training dataset")
        parser.add_argument("--percentile", help="percentile of features to keep", type=int, default=10)
        parser.add_argument(
            "--classification",
            action="store_true",
            default=False,
            help="Uses a classification training signal instead of regression that is the default if this parameter is not set.",
        )
        parser.add_argument(
            "--no_config",
            action="store_true",
            default=False,
            help="Do not generate an output configuration file.",
        )
        # The base class gets a chance to extend (or replace) the parser.
        parser = self.parse_cmd(parser)
        pargs = parser.parse_args(args)
        self.assign_arguments(pargs)

        # getattr fallbacks only matter if parse_cmd returned a parser that
        # lacks these options; otherwise argparse always defines them.
        self.no_config = getattr(pargs, "no_config", False)
        self.percentile = getattr(pargs, "percentile", 20)
        # Use the parsed flag value.  Testing only for the attribute's
        # existence forced classification mode on every run, because
        # store_true options always exist on the namespace.
        self.classification = getattr(pargs, "classification", False)

        if not hasattr(pargs, "training_file"):
            print("Error: No training file parameter provided. Use option -h to show help.")
            # Non-zero status so callers/shells can detect the failure.
            sys.exit(1)
        # May be None when --training_file was omitted; that is legal when a
        # saved mask is applied instead, so it is validated in
        # feature_selection(), where the file is actually needed.
        self.training_file = pargs.training_file

    def core(self):
        """Run the core task: apply a saved mask or perform feature selection.

        If ``input_config_file`` is set (not ``None``) the saved mask is
        loaded and applied; otherwise features are selected from scratch.
        """
        _logger.debug("Performing core module task.")
        if getattr(self, "input_config_file", None) is not None:
            _logger.debug("Loading configuration file.")
            self.load_from_config()
        else:
            _logger.debug("Performing feature_selection() method.")
            self.feature_selection()
            _logger.debug("End feature_selection() method.")

    def feature_selection(self):
        """Fit the selector on the training signal and filter ``input_ds``.

        Loads the training signal from ``training_file`` (comma-separated),
        fits a ``SelectPercentile`` on ``(input_ds, training_ds)``, stores the
        selected columns in ``output_ds`` and, unless ``no_config`` is set,
        dumps the column mask to ``output_config_file``.

        Raises:
            ValueError: if no training file has been configured.
        """
        training_file = getattr(self, "training_file", None)
        if training_file is None:
            raise ValueError(
                "No training file parameter provided. Use option -h to show help."
            )
        # loads the training file
        self.training_ds = np.genfromtxt(training_file, delimiter=",")

        # Defaults for instances configured without going through parse_args().
        self.classification = getattr(self, "classification", False)
        self.percentile = getattr(self, "percentile", 20)
        self.no_config = getattr(self, "no_config", False)

        # Initialize feature selector with the score function for the mode.
        if self.classification:
            score_func = score_func_classification
        else:
            score_func = score_func_regression
        selector = SelectPercentile(score_func=score_func, percentile=self.percentile)

        # fit feature selector using the training signal
        selector.fit(self.input_ds, self.training_ds)

        # applies feature selection mask to the input dataset
        mask = selector.get_support()
        self.output_ds = self.input_ds[:, mask]

        # saves configuration file
        if not self.no_config:
            dump(mask, self.output_config_file)

    def load_from_config(self):
        """Apply a previously saved column mask to ``input_ds``."""
        # Lazy %-formatting: no TypeError if the path is not a plain str.
        _logger.debug("Loading configuration from input_config_file = %s", self.input_config_file)
        # SECURITY NOTE: joblib.load is pickle-based and can execute arbitrary
        # code; only load configuration files from trusted sources.
        mask = load(self.input_config_file)
        self.output_ds = self.input_ds[:, mask]

    def store(self):
        """Save the preprocessed data to ``output_file`` as CSV."""
        _logger.debug("Storing output_file = %s", self.output_file)
        np.savetxt(self.output_file, self.output_ds, delimiter=",", fmt='%1.6f')
def run(args):
    """Console-script entry point.

    Builds a ``FeatureSelector`` with no configuration object and hands the
    argument list to its ``main`` method.

    Args:
        args ([str]): command line parameters, passed through unchanged.

    NOTE(review): ``main`` is not defined in this file; presumably inherited
    from ``Preprocessor`` -- confirm there how ``args`` is consumed.
    """
    FeatureSelector(None).main(args)
# Script entry: only runs when the module is executed directly, not on import.
if __name__ == "__main__":
    # The complete argument vector (including the program name) is forwarded.
    run(sys.argv)