from __future__ import print_function, division, with_statement
import numpy as np
import pickle
def check_array(x, ndim=None):
    """Validate a binary array-like and return it as a boolean array.

    Parameters
    ----------
    x : array-like
        Object exposing ``shape``, ``__array__`` or ``__len__`` (strings
        excluded). Every element must equal its own boolean cast, i.e. be
        0/1 or False/True.
    ndim : int, optional
        Required number of dimensions. Not checked when falsy.

    Returns
    -------
    numpy.ndarray
        ``x`` converted to a C-ordered array of dtype ``bool``.

    Raises
    ------
    TypeError
        If ``x`` is a string or is not array-like at all.
    ValueError
        If the dimensionality does not match ``ndim`` or a non-binary
        value is present.
    """
    scalar_like = isinstance(x, str) or not hasattr(x, '__len__')
    if not hasattr(x, 'shape') and scalar_like and not hasattr(x, '__array__'):
        raise TypeError("Array must be provided, got: " + str(type(x)))

    # np.asarray copies only when required. np.array(..., copy=False) raises
    # on NumPy >= 2.0 whenever a copy cannot be avoided (e.g. list input).
    x = np.asarray(x, order='C')

    if ndim and ndim != x.ndim:
        raise ValueError("Array must be " + str(ndim) + "-dimensional in shape, got " + str(x.ndim) +
                         " dimensions instead")

    # The builtin bool is used because the np.bool alias is absent from
    # several NumPy releases.
    asbool = x.astype(bool)

    # A value other than 0/1 no longer compares equal after the bool cast.
    if not np.array_equal(x, asbool):
        raise ValueError("Array must contain only binary members (0 or 1), got " + str(x))

    return asbool
def check_consistent_length(x, y):
    """Report whether two arrays have the same size along their first axis.

    Parameters
    ----------
    x, y : objects exposing ``ndim`` and ``shape``

    Returns
    -------
    bool
        True when ``x.shape[0]`` equals ``y.shape[0]``.

    Raises
    ------
    ValueError
        If either array has fewer than one dimension.
    """
    # The generator keeps the original short-circuit: y is not inspected
    # when x already fails.
    if any(arr.ndim < 1 for arr in (x, y)):
        raise ValueError("Arrays must have at least one dimension")

    rows_x = x.shape[0]
    rows_y = y.shape[0]
    return rows_x == rows_y
def check_is_fitted(o, n):
    """Raise unless attribute ``n`` exists on ``o`` and is truthy.

    Parameters
    ----------
    o : object
        Object to inspect.
    n : str
        Name of the attribute that marks the object as fitted.

    Raises
    ------
    ValueError
        If the attribute is missing or evaluates to false.
    """
    fitted = hasattr(o, n) and getattr(o, n)
    if not fitted:
        raise ValueError("Model not fitted yet")
def get_feature(features, i):
    """Return the display name for a signed, 1-based feature id.

    Parameters
    ----------
    features : list of str
        Feature names.
    i : int
        Signed 1-based feature id. A positive id selects ``features[i - 1]``;
        a negative id selects the same feature prefixed with ``"not "``.

    Returns
    -------
    str
        The feature name, or ``""`` when ``features`` is empty, ``i`` is 0,
        or ``abs(i)`` exceeds the number of features.
    """
    # Id 0 refers to no feature. Without this guard it would index
    # features[-1] and silently return the last feature's name.
    if not features or i == 0 or abs(i) > len(features):
        return ""
    if i < 0:
        return "not " + features[-i - 1]
    return features[i - 1]
def check_in(name, allowed, val):
    """Check that an option value is one of the allowed choices.

    The value is lower-cased before the membership test; the entries of
    ``allowed`` are compared as given.

    Parameters
    ----------
    name : str
        Name of the option, used in the error message.
    allowed : collection of str
        Accepted values.
    val : str
        Value to check.

    Raises
    ------
    ValueError
        If the lower-cased ``val`` is not in ``allowed``.
    """
    if val.lower() not in allowed:
        allowed_str = "'" + "' '".join(allowed) + "'"
        raise ValueError(name + " must be chosen from " + allowed_str +
                         ", got: " + val)
def check_features(features):
    """Validate that ``features`` is a list made up only of strings.

    Parameters
    ----------
    features : list of str
        Feature names to validate.

    Raises
    ------
    TypeError
        If ``features`` is not a list or any element is not a string.
    """
    if not isinstance(features, list):
        raise TypeError("Features must be a list, got: " + str(type(features)))

    for feature in features:
        if not isinstance(feature, str):
            raise TypeError("Each feature must be a string, got: " + str(type(feature)))
def check_rulelist(rl):
    """Validate the structure of a rulelist object.

    Parameters
    ----------
    rl : object
        Must expose ``rules`` (a non-empty list of dicts), ``features``
        (a list of strings) and ``prediction_name`` (a string). Each rule
        dict needs a ``"prediction"`` entry (bool or int) and an
        ``"antecedents"`` entry (list of ints whose absolute value does not
        exceed the number of features). The last rule must be the default
        rule, with antecedents exactly ``[0]``.

    Raises
    ------
    ValueError
        If a required attribute or key is missing, the rule list is empty,
        an antecedent id is out of bounds, or the last rule is not the
        default rule.
    TypeError
        If any attribute or rule member has the wrong type.
    """
    required = ("rules", "features", "prediction_name")
    if not all(hasattr(rl, attr) for attr in required):
        raise ValueError("Rulelist must have the following attributes: 'rules', 'features', 'prediction_name'")

    if not isinstance(rl.rules, list):
        raise TypeError("Rulelist rules must be a list, got: " + str(type(rl.rules)))

    if not isinstance(rl.prediction_name, str):
        raise TypeError("Prediction name must be a string, got: " + str(type(rl.prediction_name)))

    check_features(rl.features)
    n_features = len(rl.features)

    if len(rl.rules) < 1:
        raise ValueError("Rulelist must contain at least the default rule")

    last_index = len(rl.rules) - 1
    for index, rule in enumerate(rl.rules):
        if not isinstance(rule, dict):
            raise TypeError("Each rule must be a dict, got: " + str(type(rule)))

        if "prediction" not in rule:
            raise ValueError("Rule dicts must contain 'prediction' key")

        if "antecedents" not in rule:
            raise ValueError("Rule dicts must contain 'antecedents' key")

        prediction = rule["prediction"]
        if not isinstance(prediction, (bool, int)):
            raise TypeError("Rule predictions must be bools or ints, got: " + str(type(prediction)))

        antecedents = rule["antecedents"]
        if not isinstance(antecedents, list):
            raise TypeError("Rule antecedents must be lists, got: " + str(type(antecedents)))

        for rule_id in antecedents:
            if not isinstance(rule_id, int):
                raise TypeError("Rule id must be an int, got: " + str(type(rule_id)))

            if abs(rule_id) > n_features:
                raise ValueError("Rule id out of bounds: " + str(rule_id))

        # Checked once per rule, outside the antecedent loop, so that an
        # empty antecedent list on the last rule is rejected as well.
        if index == last_index and (len(antecedents) != 1 or antecedents[0] != 0):
            raise ValueError("Last rule in the rulelist must be the default prediction,"
                             " with antecedents '[0]', got: " + str(antecedents))
class RuleList:
    """This class represents a rulelist. It is used to store the model generated by
    `CorelsClassifier.fit`.

    Attributes
    ----------
    rules : list
        List of rule dicts. Each dict holds an ``"antecedents"`` list of
        signed, 1-based indices into the features array and a
        ``"prediction"`` value. The last rule is the default rule.
    features : list
        Set of all features. An array of strings.
    prediction_name : str
        Name of the feature being predicted.
    """

    def __init__(self, rules=None, features=None, prediction_name=""):
        # Build fresh lists per instance: a mutable default argument would be
        # shared by every RuleList constructed without arguments.
        self.rules = [] if rules is None else rules
        self.features = [] if features is None else features
        self.prediction_name = prediction_name

    def save(self, fname):
        """
        Save the rulelist to a file, using python's pickle module.

        Parameters
        ----------
        fname : string
            File name to store the rulelist in

        Returns
        -------
        self : obj

        Raises
        ------
        ValueError, TypeError
            If the rulelist fails validation; nothing is written in that case.
        """
        check_rulelist(self)

        payload = {"f": self.features, "r": self.rules, "p": self.prediction_name}
        with open(fname, "wb") as f:
            pickle.dump(payload, f)

        return self

    def load(self, fname):
        """
        Load a rulelist from a file, using python's pickle module.

        Parameters
        ----------
        fname : string
            File name to load the rulelist from

        Returns
        -------
        self : obj

        Raises
        ------
        ValueError, TypeError
            If the file does not contain a valid rulelist. The instance is
            left unmodified in that case.
        """
        with open(fname, "rb") as f:
            # SECURITY NOTE: unpickling can execute arbitrary code. Only load
            # rulelist files that come from a trusted source.
            rl_dict = pickle.load(f)

        if not isinstance(rl_dict, dict) or any(key not in rl_dict for key in ("r", "f", "p")):
            raise ValueError("Invalid rulelist file")

        # Validate on a scratch instance so a bad file never leaves this
        # object half-updated.
        candidate = RuleList(rl_dict["r"], rl_dict["f"], rl_dict["p"])
        check_rulelist(candidate)

        self.rules = candidate.rules
        self.features = candidate.features
        self.prediction_name = candidate.prediction_name

        return self

    def __str__(self):
        check_rulelist(self)

        pieces = ["RULELIST:\n"]

        if len(self.rules) == 1:
            # Only the default rule is present.
            pieces.append(self.prediction_name + " = " + str(self.rules[0]["prediction"]))
        else:
            # Every rule except the trailing default becomes an if/else arm.
            for rule in self.rules[:-1]:
                feat = " && ".join(get_feature(self.features, rule_id)
                                   for rule_id in rule["antecedents"])
                pieces.append("if [" + feat + "]:\n " + self.prediction_name + " = " +
                              str(bool(rule["prediction"])) + "\nelse ")

            pieces.append("\n " + self.prediction_name + " = " + str(bool(self.rules[-1]["prediction"])))

        return "".join(pieces)

    def __repr__(self):
        return str(self) + "\nAll features: (" + str(self.features) + ")"
def load_from_csv(fname):
    """
    Load a dataset from a csv file. The csv file must contain n_samples+1 rows, each with n_features+1
    columns. The last column of each sample is its prediction class, and the first row of the file
    contains the feature names and prediction class name.

    Parameters
    ----------
    fname : str
        File name of the csv data file

    Returns
    -------
    X : array-like, shape = [n_samples, n_features]
        The sample data
    y : array-like, shape = [n_samples]
        The target values for the sample data
    features : list
        A list of strings of length n_features. Specifies the names of each of the features.
    prediction_name : str
        The name of the prediction class
    """
    with open(fname, "r") as f:
        header = f.readline().strip().split(",")

    # The last header entry names the prediction column; the rest are features.
    prediction_name = header[-1]
    features = header[:-1]

    data = np.genfromtxt(fname, dtype=np.uint8, skip_header=1, delimiter=",")

    # genfromtxt drops a dimension when the file holds a single sample row or
    # a single column. Restore the 2-D layout from the header width so the
    # column slicing below works for those files too.
    if data.ndim < 2:
        data = data.reshape(-1, len(header))

    X = data[:, :-1]
    y = data[:, -1]

    return X, y, features, prediction_name