Regression tree - SSR impurity measure

I have code for a classification decision tree, and I'm trying to convert it to a regression tree.

I understand that I need to change the impurity measure. In classification I have Gini and entropy; in regression I need to use SSR (the sum of squared residuals). If I understand correctly, I need to change the information_gain function so that it calculates the reduction in SSR instead.

Can someone help me understand how I should change it?
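
If I've got the math right, for a node with targets y, SSR(y) = sum((y_i - mean(y))^2), and the best split is the one that maximizes SSR(parent) - (SSR(left) + SSR(right)). Since the variance is just SSR divided by the number of samples, this is the same as maximizing the weighted variance reduction. A quick sketch of the formula in numpy (not from my tree, just to check my reasoning):

ssr = lambda y: np.sum((y - np.mean(y))**2)
gain = ssr(parent_y) - (ssr(left_y) + ssr(right_y))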

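The tree uses a Node class like this one (not shown in the post; reconstructed from how it is constructed below, i.e. Node(feature_index, threshold, left, right, info_gain) for decision nodes and Node(value=...) for leaves):

import numpy as np

class Node():
    def __init__(self, feature_index=None, threshold=None, left=None, right=None, info_gain=None, value=None):
        ''' constructor '''
        # for decision nodes: split feature/threshold, children, and the gain of the split
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right
        self.info_gain = info_gain
        # for leaf nodes: the predicted value
        self.value = value
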
class DecisionTreeClassifier():
    def __init__(self, min_samples_split=2, max_depth=2):
        ''' constructor '''

        # the root of the tree
        self.root = None

        # stopping conditions:
        # if the number of samples drops below min_samples_split, the node becomes a leaf;
        # same when the tree reaches max_depth
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
    
    def build_tree(self, dataset, curr_depth=0):
        ''' recursive function to build the tree '''
        # split the features and the target
        X, Y = dataset[:,:-1], dataset[:,-1]
        num_samples, num_features = np.shape(X)

        # split until the stopping conditions are met
        if num_samples>=self.min_samples_split and curr_depth<=self.max_depth:
            # find the best split
            best_split = self.get_best_split(dataset, num_samples, num_features)
            # check that the information gain is positive; a gain of 0 means the node is pure
            if best_split["info_gain"]>0:
                # recurse left
                left_subtree = self.build_tree(best_split["dataset_left"], curr_depth+1)
                # recurse right
                right_subtree = self.build_tree(best_split["dataset_right"], curr_depth+1)
                # return a decision node
                return Node(best_split["feature_index"], best_split["threshold"],
                            left_subtree, right_subtree, best_split["info_gain"])

        # compute the leaf value
        leaf_value = self.calculate_leaf_value(Y)
        # return a leaf node
        return Node(value=leaf_value)

    def get_best_split(self, dataset, num_samples, num_features):
        ''' function to find the best split '''

        # dictionary to store the best split
        best_split = {}
        # we want to maximize the information gain, so start below any possible value
        max_info_gain = -float("inf")

        # loop over all the features
        for feature_index in range(num_features):
            feature_values = dataset[:, feature_index]
            # the unique values of this feature are the candidate thresholds
            possible_thresholds = np.unique(feature_values)
            # loop over all the feature values present in the data
            for threshold in possible_thresholds:
                # get the current split
                dataset_left, dataset_right = self.split(dataset, feature_index, threshold)
                # check that neither child is empty
                if len(dataset_left)>0 and len(dataset_right)>0:
                    # get the target values of the parent and the two children
                    y, left_y, right_y = dataset[:, -1], dataset_left[:, -1], dataset_right[:, -1]
                    # compute the information gain
                    curr_info_gain = self.information_gain(y, left_y, right_y, "gini")
                    # if the current information gain beats the best one found so far,
                    # update the best split
                    if curr_info_gain>max_info_gain:
                        best_split["feature_index"] = feature_index
                        best_split["threshold"] = threshold
                        best_split["dataset_left"] = dataset_left
                        best_split["dataset_right"] = dataset_right
                        best_split["info_gain"] = curr_info_gain
                        max_info_gain = curr_info_gain

        # return the best split
        return best_split

    def split(self, dataset, feature_index, threshold):
        ''' function to split the data '''
        # split the dataset on the given feature and threshold:
        # rows with feature value <= threshold go left, the rest go right
        dataset_left = np.array([row for row in dataset if row[feature_index]<=threshold])
        dataset_right = np.array([row for row in dataset if row[feature_index]>threshold])
        return dataset_left, dataset_right

    def information_gain(self, parent, l_child, r_child, mode="gini"):
        ''' function to compute information gain '''
        # the weights are the child sizes relative to the parent
        weight_l = len(l_child) / len(parent)
        weight_r = len(r_child) / len(parent)
        # parent impurity minus the weighted impurity of the children
        if mode=="gini":
            gain = self.gini_index(parent) - (weight_l*self.gini_index(l_child) + weight_r*self.gini_index(r_child))
        else:
            gain = self.entropy(parent) - (weight_l*self.entropy(l_child) + weight_r*self.entropy(r_child))
        return gain

    # entropy isn't needed for this homework, but it's nice to have
    '''def entropy(self, y):
        # function to compute entropy

        class_labels = np.unique(y)
        entropy = 0
        for cls in class_labels:
            p_cls = len(y[y == cls]) / len(y)
            entropy += -p_cls * np.log2(p_cls)
        return entropy'''

    def gini_index(self, y):
        ''' function to compute gini index '''

        class_labels = np.unique(y)
        gini = 0
        for cls in class_labels:
            p_cls = len(y[y == cls]) / len(y)
            gini += p_cls**2
        return 1 - gini
    
    def calculate_leaf_value(self, Y):
        ''' function to compute the leaf value '''
        # the most frequently occurring class in Y
        Y = list(Y)
        return max(Y, key=Y.count)

    def print_tree(self, tree=None, indent=" "):
        ''' recursive function to print the tree '''

        if not tree:
            tree = self.root

        if tree.value is not None:
            print(tree.value)

        else:
            print("X_"+str(tree.feature_index), "<=", tree.threshold, "?", tree.info_gain)
            print("%sleft:" % (indent), end="")
            self.print_tree(tree.left, indent + indent)
            print("%sright:" % (indent), end="")
            self.print_tree(tree.right, indent + indent)

    def fit(self, X, Y):
        ''' function to train the tree '''

        # Y must be a column vector (shape (n, 1)) for the concatenation to work
        dataset = np.concatenate((X, Y), axis=1)
        self.root = self.build_tree(dataset)

    def predict(self, X):
        ''' function to predict a new dataset '''

        predictions = [self.make_prediction(x, self.root) for x in X]
        return predictions

    def make_prediction(self, x, tree):
        ''' function to predict a single data point '''

        # a leaf stores its prediction in tree.value
        if tree.value is not None: return tree.value
        feature_val = x[tree.feature_index]
        if feature_val<=tree.threshold:
            return self.make_prediction(x, tree.left)
        else:
            return self.make_prediction(x, tree.right)
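
Based on the formula above, here is my attempt at the two changes for regression (the method name variance_reduction and the use of np.var are my own sketch, not something I've verified):

    def variance_reduction(self, parent, l_child, r_child):
        ''' replacement for information_gain: variance reduction, i.e. the drop in SSR '''
        weight_l = len(l_child) / len(parent)
        weight_r = len(r_child) / len(parent)
        # np.var(y) equals SSR(y) / len(y), so this difference is
        # (SSR(parent) - SSR(left) - SSR(right)) / len(parent)
        return np.var(parent) - (weight_l*np.var(l_child) + weight_r*np.var(r_child))

    def calculate_leaf_value(self, Y):
        ''' leaf value for regression: the mean of the targets instead of the mode '''
        return np.mean(Y)

and in get_best_split the call would become:

    curr_info_gain = self.variance_reduction(y, left_y, right_y)

Is that the right idea?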

