# Solution
# There are various ways to construct a tree, but the most straightforward is to use recursion.
# The function `median_tree` repeatedly calls itself to build the left and right branches of the current node,
# until no more splitting is needed.

import numpy as np

def gini(S):
    """Return the Gini impurity of a collection of labels.

    The impurity is the sum of ``p * (1 - p)`` over the distinct labels, where
    ``p`` is the fraction of entries carrying that label. See Chapter 5
    exercises.

    Parameters
    ----------
    S : sized iterable of hashable labels, e.g. a list, a tuple or a 1-D
        NumPy array.

    Returns
    -------
    0 for an empty collection, otherwise the impurity as a float
    (0.0 when all labels are equal).
    """
    from collections import Counter  # local import keeps the function self-contained

    n = len(S)
    if n == 0:
        # Nothing to be impure about; also avoids dividing by zero below.
        return 0

    g = 0
    # One counting pass instead of calling S.count() once per distinct label.
    for count in Counter(S).values():
        p = count / n
        g += p * (1 - p)
    return g

def median_tree(X, y, max_depth=5, level=0):
    """Recursively build a classification tree that splits at coordinate medians.

    At every node each coordinate is tried as a split "value <= median" versus
    "value > median". The coordinate with the smallest size-weighted Gini
    impurity of the two sides is kept, and both sides are split recursively.

    Parameters
    ----------
    X : 2-D NumPy array with one sample per row.
    y : sequence of labels, one per row of X.
    max_depth : level at which splitting stops.
    level : depth of the node being built; callers normally leave it at 0.

    Returns
    -------
    dict with keys 'level', 'subsetX', 'subsety', 'gini', 'split_coord',
    'split_value', 'left' and 'right'. For a leaf the last four are None;
    otherwise 'left' and 'right' hold the sub-trees in the same format.
    """
    # Every node starts out as a leaf; the split fields are filled in below
    # only if a useful split is found.
    node = {
        'level': level,
        'subsetX': X,
        'subsety': y,
        'gini': gini(y),
        'split_coord': None,
        'split_value': None,
        'left': None,
        'right': None
    }

    # if y is pure (or empty) or max_depth reached => no more splitting
    if len(set(y)) <= 1 or level == max_depth:
        return node

    # otherwise try splitting
    n, d = X.shape
    best_total = float('inf')  # total impurity of split
    best_split = None

    # test each coordinate and find the best one
    for coord in range(d):
        column = X[:, coord]
        median = np.median(column)
        left_mask = column <= median
        right_mask = column > median

        left_y = [label for label, keep in zip(y, left_mask) if keep]
        right_y = [label for label, keep in zip(y, right_mask) if keep]

        # A split with an empty side separates nothing (this happens when
        # more than half of the column equals its maximum). Accepting it
        # would recurse on the same data and create an empty leaf that
        # cannot produce a prediction.
        if not left_y or not right_y:
            continue

        total = len(left_y) * gini(left_y) + len(right_y) * gini(right_y)

        if total < best_total:
            best_total = total
            best_split = (coord, median, left_mask, right_mask, left_y, right_y)

    # No coordinate separates the samples => this node stays a leaf
    if best_split is None:
        return node

    coord, median, left_mask, right_mask, left_y, right_y = best_split
    node['split_coord'] = coord
    node['split_value'] = median
    node['left'] = median_tree(X[left_mask, :], left_y, max_depth, level + 1)
    node['right'] = median_tree(X[right_mask, :], right_y, max_depth, level + 1)
    return node

# Example: build a depth-2 tree on five 2-D points.
X = np.array([[2, 3], [10, 15], [3, 4], [8, 9], [6, 7]])
# NOTE(review): these integer labels are replaced by the string labels on the
# next line before they are ever used.
y = [0, 1, 0, 1, 0]
y = ['a', 'b', 'a', 'b', 'a']
tree = median_tree(X, y, max_depth=2)
print(tree)

{'level': 0, 'subsetX': array([[ 2,  3],
       [10, 15],
       [ 3,  4],
       [ 8,  9],
       [ 6,  7]]), 'subsety': ['a', 'b', 'a', 'b', 'a'], 'gini': 0.48, 'split_coord': 0, 'split_value': 6.0, 'left': {'level': 1, 'subsetX': array([[2, 3],
       [3, 4],
       [6, 7]]), 'subsety': ['a', 'a', 'a'], 'gini': 0.0, 'split_coord': None, 'split_value': None, 'left': None, 'right': None}, 'right': {'level': 1, 'subsetX': array([[10, 15],
       [ 8,  9]]), 'subsety': ['b', 'b'], 'gini': 0.0, 'split_coord': None, 'split_value': None, 'left': None, 'right': None}}

# Solution
def median_tree_predict(tree, X):
    """Predict one label per row of X with a tree in the median_tree format.

    Each sample is routed from the root to a leaf: at an internal node it goes
    left when its value at 'split_coord' is <= 'split_value', right otherwise.
    The prediction is the most frequent label stored in the leaf's 'subsety';
    ties go to the label that appears first in that list, so results are
    reproducible from run to run.

    Parameters
    ----------
    tree : nested dict with keys 'split_coord', 'split_value', 'subsety',
        'left' and 'right'.
    X : iterable of samples, each indexable by coordinate (e.g. the rows of a
        2-D NumPy array).

    Returns
    -------
    list with one predicted label per sample.

    Raises
    ------
    ValueError if a sample ends up in a leaf that holds no labels.
    """
    from collections import Counter  # local import keeps the function self-contained

    def predict_single(x):
        node = tree
        # Descend until a leaf (a node without a split) is reached.
        while node['split_coord'] is not None:
            if x[node['split_coord']] <= node['split_value']:
                node = node['left']
            else:
                node = node['right']

        labels = node['subsety']
        if len(labels) == 0:
            raise ValueError("cannot predict from a leaf that holds no training labels")
        # most_common keeps first-seen order among equal counts, unlike
        # max() over a set, whose iteration order is arbitrary.
        return Counter(labels).most_common(1)[0][0]

    return [predict_single(x) for x in X]

# Example usage: predict a label for every row of X with the tree built above.
yhat = median_tree_predict(tree, X)
print(yhat)

['a', 'b', 'a', 'b', 'a']

# Solution
import pandas as pd
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load seaborn's "penguins" example dataset and drop every row with a missing value.
penguins = sns.load_dataset("penguins")
penguins = penguins.dropna()
# The four measurement columns used as features.
features = [
  "bill_length_mm",
  "bill_depth_mm",
  "flipper_length_mm",
  "body_mass_g"
]
X = penguins[features]
# The species column is the classification target.
y = penguins["species"]

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffled split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
  X, y,
  test_size=0.2,
  shuffle=True,
  random_state=19716
)

# Compare scikit-learn's decision tree with median_tree for depths 1 through 5.
for max_depth in range(1,6):

  print("max_depth =", max_depth)

  dtree = DecisionTreeClassifier(max_depth=max_depth)

  dtree.fit(X_train, y_train)
  yhat = dtree.predict(X_test)
  # accuracy_score(y_test, yhat) is the same as np.mean(yhat == y_test.values)
  print(f"   accuracy of DecisionTreeClassifier is {accuracy_score(y_test, yhat):.1%}")

  # median_tree indexes X like a NumPy array and counts labels with
  # list.count, so the pandas objects are converted before the call.
  tree = median_tree(X_train.values, list(y_train.values), max_depth=max_depth)
  yhat = median_tree_predict(tree, X_test.values)
  print(f"   accuracy of median_tree classifier is {accuracy_score(y_test, yhat):.1%}")

max_depth = 1
   accuracy of DecisionTreeClassifier is 82.1%
   accuracy of median_tree classifier is 76.1%
max_depth = 2
   accuracy of DecisionTreeClassifier is 92.5%
   accuracy of median_tree classifier is 85.1%
max_depth = 3
   accuracy of DecisionTreeClassifier is 92.5%
   accuracy of median_tree classifier is 82.1%
max_depth = 4
   accuracy of DecisionTreeClassifier is 94.0%
   accuracy of median_tree classifier is 94.0%
max_depth = 5
   accuracy of DecisionTreeClassifier is 95.5%
   accuracy of median_tree classifier is 92.5%

# Solution
import numpy as np

def mypca(X, dhat):
    """Return the first `dhat` principal directions of X as matrix columns.

    X holds one sample per row. Its columns are shifted to zero mean, and the
    directions are read off the reduced singular value decomposition of the
    centered data. Multiplying data by the returned matrix (``X @ R``) gives
    its coordinates along those directions.
    """
    centered = X - X.mean(axis=0)
    # Only the third SVD factor is needed: its rows are the right singular
    # vectors, which NumPy orders by decreasing singular value.
    _, _, directions = np.linalg.svd(centered, full_matrices=False)
    return directions[:dhat].T

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the spambase data; every column except "class" is used as a feature.
spam = pd.read_csv("_datasets/spambase.csv")
X = spam.drop("class", axis=1)
# Boolean target: True where the class column equals 1.
y = spam["class"] == 1

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffled split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2, 
    shuffle=True, random_state=302
    )

# One classifier object is reused; it is fitted anew for every configuration below.
dtree = DecisionTreeClassifier(max_depth=7)

# Iterate over a tuple, not a set: variant 1 overwrites X_train/X_test with
# their z-normalized versions, so the raw-data pass (variant 0) has to run
# first and the iteration order must be guaranteed.
for variant in (0, 1):

    if variant == 0:
        print("without z-normalization:")
    else:
        print("\nwith z-normalization:")
        # z-normalize
        mu = X_train.mean(axis=0)
        sigma = X_train.std(axis=0)
        X_train = (X_train - mu)/sigma
        X_test = (X_test - mu)/sigma   # important: normalise test set with training set mean and std

    # Project both sets onto the first dhat principal directions of the
    # training data, then fit and score the tree on the projected features.
    for dhat in range(1,6):
        R = mypca(X_train, dhat)
        dtree.fit(X_train@R, y_train)
        yhat = dtree.predict(X_test@R)
        print("   dhat =", dhat, "- accuracy =", (yhat == y_test).mean())

without z-normalization:
   dhat = 1 - accuracy = 0.6699239956568946
   dhat = 2 - accuracy = 0.7339847991313789
   dhat = 3 - accuracy = 0.7328990228013029
   dhat = 4 - accuracy = 0.8306188925081434
   dhat = 5 - accuracy = 0.8610206297502715

with z-normalization:
   dhat = 1 - accuracy = 0.8393051031487514
   dhat = 2 - accuracy = 0.8686210640608035
   dhat = 3 - accuracy = 0.8642779587404995
   dhat = 4 - accuracy = 0.8762214983713354
   dhat = 5 - accuracy = 0.8946796959826275

import numpy as np
import pandas as pd
# Load the pulsar data set used by the solution below.
signals = pd.read_csv("_datasets/pulsars.csv")

# TODO: Provide your solution code here.

# Solution

# part 1: separate the features from the "class" column and report the mean
# of that column as the fraction of pulsars
# (assumes class is coded 0/1 with 1 = pulsar — TODO confirm against the data set)
X = signals.drop(columns=["class"])
y = signals["class"]
pulsar_fraction = y.mean()
print(f"{pulsar_fraction:.1%} of the samples are pulsars.\n")

# part 2: hold out 20% of the samples for testing, with a fixed seed for reproducibility
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=19716)

# Sanity checks: expected training-set shape and sum of the test labels
assert X_train.shape == (14318, 8)
assert y_test.sum() == 326

# part 3: standardize the features, classify with k-nearest neighbors (k=8),
# and measure precision on the test set
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score
from sklearn.neighbors import KNeighborsClassifier
# Putting the scaler inside the pipeline means it is fitted on the training
# data only and then applied unchanged to whatever is passed to predict.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=8))
])

pipeline.fit(X_train, y_train)
knn_score = precision_score(y_test, pipeline.predict(X_test))
print(f"Test score of kNN (k=8): {knn_score:.2%}\n")

# part 4: tune the kNN step of the pipeline with a cross-validated grid search
from sklearn.model_selection import GridSearchCV

# Candidates: k = 3, ..., 20 combined with both neighbor-weighting schemes.
# The "knn__" prefix routes each parameter to the pipeline step named 'knn'.
param_grid = {
    'knn__n_neighbors': np.arange(3, 21),
    'knn__weights': ['uniform', 'distance'],
}

# Perform grid search with 6-fold cross-validation, ranking candidates by precision
grid_search = GridSearchCV(pipeline, param_grid, cv=6, scoring='precision', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best parameters, then measure test-set precision by predicting
# through the fitted search object
best_params = grid_search.best_params_
grid_score = precision_score(y_test, grid_search.predict(X_test))

print(f"Best parameters: {best_params}")
print(f"Test score with those parameters: {grid_score:.2%}\n")

# part 5: bag 200 copies of the tuned pipeline; max_samples and max_features
# of 0.5 give each copy half of the training samples and half of the features
from sklearn.ensemble import BaggingClassifier
ensemble = BaggingClassifier( 
    grid_search.best_estimator_,
    max_samples=0.5,
    max_features=0.5,
    n_estimators=200,
    random_state=302
    )

ensemble.fit(X_train, y_train)
ensemble_score = precision_score(y_test, ensemble.predict(X_test))
print(f"Test score with bagging: {ensemble_score:.2%}")

9.2% of the samples are pulsars.

Test score of kNN (k=8): 92.98%

Best parameters: {'knn__n_neighbors': 14, 'knn__weights': 'uniform'}
Test score with those parameters: 93.68%

Test score with bagging: 95.22%

# Testing
# Checks on the objects produced by the pulsar solution: data shapes, the
# train/test split, and the three precision scores.
assert X.shape == (17898, 8)
assert y.shape == (17898,)
assert np.isclose(pulsar_fraction, 0.091574478)

assert X_train.shape == (14318, 8)
assert y_test.sum() == 326

assert np.isclose(knn_score, 0.92982, rtol=1e-5)

# isinstance is the idiomatic type check and also accepts dict subclasses
assert isinstance(best_params, dict), "Get the best parameters from the fitted model"
assert grid_score > knn_score, "Score should have improved"
assert np.isclose(grid_score, 0.93684, rtol=1e-5)

assert ensemble_score > grid_score, "Score should have improved"
assert np.isclose(ensemble_score, 0.952206, rtol=1e-5)

Exercise 6.1

Exercise 6.2

Exercise 6.3

Exercise 6.4

Exercise 6.5

Exercise 6.6

Exercise 6.7

Exercise 6.8