# Source code for corels.utils

from __future__ import print_function, division, with_statement
import numpy as np
import pickle

def check_array(x, ndim=None):
    """
    Validate that `x` is a binary array and return it as a boolean array.

    Parameters
    ----------
    x : array-like
        Anything exposing `shape` or `__array__`, or any non-string object
        with `__len__` (lists, tuples, ...).

    ndim : int, optional
        Required number of dimensions. When falsy (the default `None`), the
        dimensionality is not checked.

    Returns
    -------
    asbool : numpy.ndarray of bool
        The contents of `x` converted to booleans.

    Raises
    ------
    TypeError
        If `x` is not array-like (e.g. a string or a scalar).
    ValueError
        If `x` has the wrong number of dimensions or contains any value other
        than 0 or 1.
    """
    looks_like_array = (
        hasattr(x, 'shape')
        or hasattr(x, '__array__')
        or (hasattr(x, '__len__') and not isinstance(x, str))
    )
    if not looks_like_array:
        raise TypeError("Array must be provided, got: " + str(type(x)))

    # np.asarray copies only when it has to. np.array(..., copy=False) raises
    # on NumPy >= 2.0 whenever a copy is required (e.g. for list input).
    x = np.asarray(x, order='C')

    if ndim and ndim != x.ndim:
        raise ValueError("Array must be " + str(ndim) + "-dimensional in shape, got " + str(x.ndim) +
                         " dimensions instead")

    # Use the builtin bool: the np.bool alias is missing from NumPy 1.24-1.26.
    asbool = x.astype(bool)

    # The bool round-trip only compares equal when every member is 0 or 1.
    if not np.array_equal(x, asbool):
        raise ValueError("Array must contain only binary members (0 or 1), got " + str(x))

    return asbool

def check_consistent_length(x, y):
    """
    Report whether two arrays have the same length along their first axis.

    Parameters
    ----------
    x, y : array-like with `ndim` and `shape` attributes

    Returns
    -------
    bool
        True when `x.shape[0]` equals `y.shape[0]`.

    Raises
    ------
    ValueError
        If either array has fewer than one dimension.
    """
    # Checked in argument order, stopping at the first offender.
    for arr in (x, y):
        if arr.ndim < 1:
            raise ValueError("Arrays must have at least one dimension")

    n_x, n_y = x.shape[0], y.shape[0]
    return n_x == n_y

def check_is_fitted(o, n):
    """
    Ensure that object `o` carries a truthy attribute named `n`.

    Parameters
    ----------
    o : object
        Object to inspect.

    n : str
        Name of the attribute that marks the object as fitted.

    Raises
    ------
    ValueError
        If the attribute is missing or evaluates to false.
    """
    fitted = hasattr(o, n) and getattr(o, n)
    if not fitted:
        raise ValueError("Model not fitted yet")

def get_feature(features, i):
    """
    Translate a signed, 1-based feature id into a readable feature name.

    Parameters
    ----------
    features : list of str
        Feature names; id `k` refers to `features[k - 1]`.

    i : int
        Feature id. A negative id denotes the negation of feature `abs(i)`.

    Returns
    -------
    str
        The feature name, prefixed with "not " for negative ids. An empty
        string is returned when `features` is empty or `i` does not refer to
        a feature (zero or out of range).
    """
    # Id 0 does not name a feature. Without this guard it would index
    # features[-1] and silently return the last feature.
    if not features or i == 0 or abs(i) > len(features):
        return ""

    name = features[abs(i) - 1]
    return "not " + name if i < 0 else name
    
def check_in(name, allowed, val):
    """
    Ensure that `val`, compared case-insensitively, is one of `allowed`.

    Parameters
    ----------
    name : str
        Name of the option, used in the error message.

    allowed : iterable of str
        The accepted values; `val` is lowercased before the lookup.

    val : str
        The value to check.

    Raises
    ------
    ValueError
        If the lowercased `val` is not in `allowed`.
    """
    if val.lower() in allowed:
        return

    quoted = "'" + "' '".join(allowed) + "'"
    message = name + " must be chosen from " + quoted + ", got: " + val
    raise ValueError(message)

def check_features(features):
    """
    Validate that `features` is a list of strings.

    Parameters
    ----------
    features : list of str
        The feature names to validate.

    Raises
    ------
    TypeError
        If `features` is not a list, or any element is not a string.
    """
    if not isinstance(features, list):
        raise TypeError("Features must be a list, got: " + str(type(features)))

    for feature in features:
        if not isinstance(feature, str):
            raise TypeError("Each feature must be a string, got: " + str(type(feature)))

def check_rulelist(rl):
    """
    Validate the structure of a rulelist object.

    The object must expose `rules` (a non-empty list of dicts), `features`
    (a list of strings) and `prediction_name` (a string). Every rule dict
    needs a 'prediction' (bool or int) and an 'antecedents' list of ints,
    none of which may exceed the number of features in magnitude. The final
    rule must be the default rule, i.e. have antecedents exactly `[0]`.

    Parameters
    ----------
    rl : object
        The rulelist to validate.

    Raises
    ------
    ValueError
        If attributes or dict keys are missing, the rule list is empty, a
        rule id is out of bounds, or the last rule is not the default rule.
    TypeError
        If any of the checked values has the wrong type.
    """
    required = ("rules", "features", "prediction_name")
    if not all(hasattr(rl, attr) for attr in required):
        raise ValueError("Rulelist must have the following attributes: 'rules', 'features', 'prediction_name'")

    rules = rl.rules
    if not isinstance(rules, list):
        raise TypeError("Rulelist rules must be a list, got: " + str(type(rules)))

    pred_name = rl.prediction_name
    if not isinstance(pred_name, str):
        raise TypeError("Prediction name must be a string, got: " + str(type(pred_name)))

    check_features(rl.features)
    n_features = len(rl.features)

    if not rules:
        raise ValueError("Rulelist must contain at least the default rule")

    last = len(rules) - 1
    for pos, entry in enumerate(rules):
        if not isinstance(entry, dict):
            raise TypeError("Each rule must be a dict, got: " + str(type(entry)))

        if "prediction" not in entry:
            raise ValueError("Rule dicts must contain 'prediction' key")
        if "antecedents" not in entry:
            raise ValueError("Rule dicts must contain 'antecedents' key")

        prediction = entry["prediction"]
        antecedents = entry["antecedents"]

        if not isinstance(prediction, (bool, int)):
            raise TypeError("Rule predictions must be bools or ints, got: " + str(type(prediction)))
        if not isinstance(antecedents, list):
            raise TypeError("Rule antecedents must be lists, got: " + str(type(antecedents)))

        for rule_id in antecedents:
            if not isinstance(rule_id, int):
                raise TypeError("Rule id must be an int, got: " + str(type(rule_id)))
            if abs(rule_id) > n_features:
                raise ValueError("Rule id out of bounds: " + str(rule_id))

        # Only the final entry is required to be the default rule.
        if pos == last and not (len(antecedents) == 1 and antecedents[0] == 0):
            raise ValueError("Last rule in the rulelist must be the default prediction,"
                             " with antecedents '[0]', got: " + str(antecedents))

class RuleList:
    """This class represents a rulelist. It is used to store the model generated by 
    `CorelsClassifier.fit`.
    
    Attributes
    ----------
    rules : list
        The rules that comprise the rulelist, in order. Each rule is a dict
        with an 'antecedents' list of signed, 1-based indices into the
        features array (a negative index means the negated feature) and a
        'prediction'. The last rule is the default rule, with antecedents [0].
    
    features : list
        Set of all features. An array of strings.
    
    prediction_name : str
        Name of the feature being predicted.
    """

    def __init__(self, rules=None, features=None, prediction_name=""):
        # Use None sentinels: list defaults in the signature would be shared
        # (and mutated) across every instance created without arguments.
        self.rules = [] if rules is None else rules
        self.features = [] if features is None else features
        self.prediction_name = prediction_name

    def save(self, fname):
        """
        Save the rulelist to a file, using python's pickle module.

        Parameters
        ----------
        fname : string
            File name to store the rulelist in
        
        Returns
        -------
        self : obj

        Raises
        ------
        ValueError, TypeError
            If the rulelist fails validation; nothing is written in that case.
        """

        check_rulelist(self)

        payload = {"f": self.features, "r": self.rules, "p": self.prediction_name}
        with open(fname, "wb") as f:
            pickle.dump(payload, f)

        return self

    def load(self, fname):
        """
        Load a rulelist from a file, using python's pickle module.

        WARNING: unpickling can execute arbitrary code. Only load files that
        come from a trusted source.
        
        Parameters
        ----------
        fname : string
            File name to load the rulelist from
        
        Returns
        -------
        self : obj

        Raises
        ------
        ValueError
            If the file does not contain a rulelist dict, or the rulelist it
            contains is invalid.
        TypeError
            If the stored rulelist has members of the wrong type.
        """

        # SECURITY: pickle.load must never be used on untrusted input.
        with open(fname, "rb") as f:
            rl_dict = pickle.load(f)

        if not isinstance(rl_dict, dict) or not all(k in rl_dict for k in ("r", "f", "p")):
            raise ValueError("Invalid rulelist file")

        # Validate on a scratch instance so that `self` is left untouched
        # when the file holds an invalid rulelist.
        rl = RuleList(rl_dict["r"], rl_dict["f"], rl_dict["p"])
        check_rulelist(rl)

        self.rules = rl.rules
        self.features = rl.features
        self.prediction_name = rl.prediction_name

        return self

    def __str__(self):
        check_rulelist(self)

        name = self.prediction_name
        parts = ["RULELIST:\n"]

        if len(self.rules) == 1:
            # Only the default rule: format the prediction through bool() so
            # the output matches the multi-rule branch below.
            parts.append(name + " = " + str(bool(self.rules[0]["prediction"])))
        else:
            for rule in self.rules[:-1]:
                # Joining also copes with an empty antecedent list.
                feat = " && ".join(get_feature(self.features, a) for a in rule["antecedents"])
                parts.append("if [" + feat + "]:\n  " + name + " = " + str(bool(rule["prediction"])) + "\nelse ")

            parts.append("\n  " + name + " = " + str(bool(self.rules[-1]["prediction"])))

        return "".join(parts)
    
    def __repr__(self):
        return self.__str__() + "\nAll features: (" + str(self.features) + ")"

def load_from_csv(fname):
    """
    Load a dataset from a csv file. The csv file must contain n_samples+1 rows, each with n_features+1
    columns. The last column of each sample is its prediction class, and the first row of the file
    contains the feature names and prediction class name.
    
    Parameters
    ----------
    fname : str
        File name of the csv data file
    
    Returns
    -------
    X : array-like, shape = [n_samples, n_features]
        The sample data

    y : array-like, shape = [n_samples]
        The target values for the sample data
    
    features : list
        A list of strings of length n_features. Specifies the names of each of the features.

    prediction_name : str
        The name of the prediction class
    """

    with open(fname, "r") as f:
        header = f.readline().strip().split(",")

    prediction_name = header[-1]
    features = header[:-1]

    data = np.genfromtxt(fname, dtype=np.uint8, skip_header=1, delimiter=",")

    # genfromtxt squeezes its result: a file with a single sample (or a single
    # column) comes back with fewer than two dimensions, which would break the
    # column slicing below. Restore the 2-D layout using the header width.
    if data.ndim < 2:
        data = data.reshape(-1, len(header))

    X = data[:, 0:-1]
    y = data[:, -1]

    return X, y, features, prediction_name