I am attempting to translate Python's sklearn.tree
export_text
output into SAS conditions.
There has been a solution given in Python for this problem here (at the very end of the page) : How to extract the decision rules from scikit-learn decision-tree?
I tried to adapt the code for generating SAS code, but I have an issue with handling nested DO; END;
Here's my code (I create a DATA STEP) :
def get_sas_from_text(tree, tree_id, features, text, spacing=2):
# tree is a decision tree from a RandomForestClassifier for instance
# tree id is a number I use for naming the table I create
# features is a list of features names
# text is the output of the export_text function from sklearn.tree
# spacing is used to handle the file size
skip, dash = ' '*spacing, '-'*(spacing-1)
code = 'data decision_tree_' + str(tree_id) + ';'
code += ' set input_data; '
n_end_last = 0 # Number of 'END;' to add at the end of the data step
splitted_text = text.split('
')
text_list = []
for i, line in enumerate(splitted_text):
line = line.rstrip().replace('|',' ')
# Handling rows for IF conditions
if '<' in line or '>' in line:
line, val = line.rsplit(maxsplit=1)
line = line.replace(' ' + dash, 'if')
line = '{} {:g} THEN DO;'.format(line, float(val))
n_end_last += 1 # need to add an END;
if i > 0 and 'PREDICTED_VALUE' in text_list[i-1]: # If there's a PREDICTED_VALUE in line above, then condition is ELSE DO
line = "ELSE DO; " + line
n_end_last += 1 # add another END
# Handling rows for PREDICTED_VALUE
else:
line = line.replace(' {} class:'.format(dash), 'PREDICTED_VALUE =')
line += ';'
line += '
end;' # Immediately add END after PREDICTED_VALUE = .. ;
n_end_last -= 1
text_list.append(line)
code += skip + line + '
'
code = code[:-1]
code += 'end; + '
''* n_end_last # add END;
code += 'run;'
return code
Here's an example with the iris data set :
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import _tree
import string
from sklearn.tree import export_text
iris = datasets.load_iris()
data=pd.DataFrame({
'sepal length':iris.data[:,0],
'sepal width':iris.data[:,1],
'petal length':iris.data[:,2],
'petal width':iris.data[:,3],
'species':iris.target
})
X=data[['sepal length', 'sepal width', 'petal length', 'petal width']] # Features
y=data['species'] # Labels
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) # 70% training and 30% test
clf=RandomForestClassifier(n_estimators=100)
#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(X_train,y_train)
# Function to export the tree rules' code : in Python (works) and in SAS (issue with DO; END;
def export_code(tree, tree_id, feature_names, max_depth=100, spacing=2):
if spacing < 2:
raise ValueError('spacing must be > 1')
# Clean up feature names (for correctness)
nums = string.digits
alnums = string.ascii_letters + nums
clean = lambda s: ''.join(c if c in alnums else '_' for c in s)
features = [clean(x) for x in feature_names]
features = ['_'+x if x[0] in nums else x for x in features if x]
if len(set(features)) != len(feature_names):
raise ValueError('invalid feature names')
# First: export tree to text
res = export_text(tree, feature_names=features,
max_depth=max_depth,
decimals=6,
spacing=spacing-1)
code_sas = get_sas_from_text(tree, tree_id, features, res, spacing)
code_py = get_py_from_text(tree, tree_id, features, res, spacing)
return res, code_sas, code_py # to take a look at the different code outputs
# Python function
def get_py_from_text(tree, tree_id, features, text, spacing):
skip, dash = ' '*spacing, '-'*(spacing-1)
code = 'def decision_tree_'+ str(tree_id) + '({}):
'.format(', '.join(features))
for line in repr(tree).split('
'):
code += skip + "# " + line + '
'
for line in text.split('
'):
line = line.rstrip().replace('|',' ')
if '<' in line or '>' in line:
line, val = line.rsplit(maxsplit=1)
line = line.replace(' ' + dash, 'if')
line = '{} {:g}:'.format(line, float(val))
else:
line = line.replace(' {} class:'.format(dash), 'return')
code += skip + line + '
'
return code
I then get the generated codes in SAS and Python for a tree's decision rules :
# Rules for first decision tree (there are 100 of them)
exported_text, sas_text, py_text = export_code(clf[0], 0, iris.feature_names)
Here are the decision rules in Python for first tree :
def decision_tree_0(sepal_length__cm_, sepal_width__cm_, petal_length__cm_, petal_width__cm_):
# DecisionTreeClassifier(max_features='auto', random_state=1087864992)
if sepal_length__cm_ <= 5.35:
if sepal_width__cm_ <= 2.7:
return 1.0
if sepal_width__cm_ > 2.7:
return 0.0
if sepal_length__cm_ > 5.35:
if petal_width__cm_ <= 1.75:
if petal_length__cm_ <= 2.5:
return 0.0
if petal_length__cm_ > 2.5:
if sepal_length__cm_ <= 7.1:
if petal_width__cm_ <= 1.45:
if petal_length__cm_ <= 5.15:
return 1.0
if petal_length__cm_ > 5.15:
return 2.0
if petal_width__cm_ > 1.45:
return 1.0
if sepal_length__cm_ > 7.1:
return 2.0
if petal_width__cm_ > 1.75:
return 2.0
The issue I have with my program get_sas_from_text
is that the 'END;' statements are not all well placed within the code, and this does not correspond to the decision rules given as an input (when compared to the corresponding Python function :
data decision_tree_0;
set input_data;
if sepal_length__cm_ <= 5.35 THEN
DO;
if sepal_width__cm_ <= 2.7 THEN
DO;
PREDICTED_VALUE = 1.0;
end;
ELSE
DO;
if sepal_width__cm_ > 2.7 THEN
DO;
PREDICTED_VALUE = 0.0;
end;
ELSE
DO;
if sepal_length__cm_ > 5.35 THEN
DO;
if petal_width__cm_ <= 1.75 THEN
DO;
if petal_length__cm_ <= 2.5 THEN
DO;
PREDICTED_VALUE = 0.0;
end;
ELSE
DO;
if petal_length__cm_ > 2.5 THEN
DO;
if sepal_length__cm_ <= 7.1 THEN
DO;
if petal_width__cm_ <= 1.45 THEN
DO;
if petal_length__cm_ <= 5.15 THEN
DO;
PREDICTED_VALUE = 1.0;
end;
ELSE
DO;
if petal_length__cm_ > 5.15 THEN
DO;
PREDICTED_VALUE = 2.0;
end;
ELSE
DO;
if petal_width__cm_ > 1.45 THEN
DO;
PREDICTED_VALUE = 1.0;
end;
ELSE
DO;
if sepal_length__cm_ > 7.1 THEN
DO;
PREDICTED_VALUE = 2.0;
end;
ELSE
DO;