import pandas as pd
import numpy as np


# Load the grades table from disk; the bare `df` displays it when this is
# run as a notebook cell.
df = pd.read_csv(filepath_or_buffer='grades.csv')
df


# DataFrame.drop returns a new frame; none of the results below is assigned,
# so `df` itself is left unchanged by these calls.

# Drop the row labelled 2.
df.drop(index=2)


# Drop the rows labelled 2, 3 and 4.
df.drop(index=[2, 3, 4])


# Drop the 'passed' column.
df.drop(columns='passed')


# Drop the 'Ex 2' and 'Ex 3' columns.
df.drop(columns=['Ex 2', 'Ex 3'])


# Preview only: element-wise test of the 'passed' column against the string
# 'true'. The result is not stored.
df['passed'] == 'true'

0     True
1    False
2     True
3     True
4    False
Name: passed, dtype: bool

df


# Convert the textual 'true'/'false' flags into real booleans.
# The dtype guard makes this safe to run more than once (and safe if the
# column was already parsed as boolean): comparing an already-boolean column
# with the string 'true' would otherwise turn every entry into False.
if not pd.api.types.is_bool_dtype(df['passed']):
    df['passed'] = df['passed'] == 'true'
df


# Mean of the four exercise scores, added left to right and divided by 4.
df['average'] = (
    df['Ex 1'].add(df['Ex 2']).add(df['Ex 3']).add(df['Ex 4']).div(4)
)
df


# Flag the rows whose average is strictly above 70.
df['mention'] = df['average'].gt(70)
df


from sklearn.preprocessing import StandardScaler
# Scaler that will be fitted on the first two exercise columns only.
sScaler = StandardScaler()


firstTerm = df.loc[:, ['Ex 1', 'Ex 2']]
sScaler.fit(X=firstTerm)

StandardScaler()


# Standardise the two columns using the statistics learned by the fit above;
# the result comes back as a NumPy array rather than a DataFrame.
sScaler.transform(firstTerm)

array([[ 0.76533169,  0.19834601],
       [-1.79747626, -1.02673227],
       [ 0.90575952,  0.78171661],
       [ 0.48447602,  1.30675016],
       [-0.35809097, -1.26008051]])


from sklearn.neighbors import KNeighborsClassifier


# Nearest-neighbour classifier that votes over the 2 closest training rows.
# The four raw (unscaled) exercise scores are the features and the 'passed'
# column is the label.
kn = KNeighborsClassifier(n_neighbors=2)
kn.fit(
    X=df.loc[:, ['Ex 1', 'Ex 2', 'Ex 3', 'Ex 4']],
    y=df.loc[:, 'passed'],
)

KNeighborsClassifier(n_neighbors=2)


# Classify two unseen students. The classifier was fitted on a DataFrame, so
# the query is passed as a DataFrame with the same column names: this keeps
# the feature order explicit and avoids scikit-learn's warning about input
# without valid feature names.
new_students = pd.DataFrame(
    [
        [10, 10, 10, 10],
        [50, 60, 70, 80],
    ],
    columns=['Ex 1', 'Ex 2', 'Ex 3', 'Ex 4'],
)
kn.predict(new_students)

array([False,  True])


import sklearn
import numpy as np
import matplotlib.pyplot as plt


from sklearn.preprocessing import StandardScaler
# Fit a scaler on 1000 draws from a normal distribution with mean 2 and
# standard deviation 3, arranged as a single feature column.
stdScaler = StandardScaler()
randomData = np.random.normal(loc=2, scale=3, size=(1000, 1))
stdScaler.fit(X=randomData)

StandardScaler()


# Statistics learned by the fit: per-feature mean and scaling factor
# (the standard deviation), which should be close to the 2 and 3 used above.
stdScaler.mean_, stdScaler.scale_

(array([1.9907856]), array([2.92175702]))


# Standardise three sample values, supplied as a single-column 2-D array.
stdScaler.transform(np.array([2, 5, -1]).reshape(-1, 1))

array([[ 0.00315372],
       [ 1.02993315],
       [-1.02362571]])


def fn(x):
    """Evaluate the cubic 7 - 8*x - 0.5*x**2 + 0.5*x**3.

    Works on scalars and, through broadcasting, on NumPy arrays.
    """
    linear_term = 8*x
    quadratic_term = 0.5*x**2
    cubic_term = 0.5*x**3
    # Combined left to right, in the same order as the formula above.
    return 7 - linear_term - quadratic_term + cubic_term
  
# Seed the global NumPy generator so the sample below is reproducible.
np.random.seed(1122)
n_train = 100
# 50 evenly spaced points on [0, 5], used to draw the noise-free curve.
xs = np.linspace(0, 5, num=50)
# Random inputs, uniform on [0, 5).
rxs = np.random.random(n_train) * 5
# The same inputs as an (n_train, 1) column, the layout scikit-learn expects.
X1D = np.column_stack((rxs,))
# Targets: the true curve plus standard-normal noise (drawn after `rxs`).
ys1D = fn(rxs) + np.random.normal(size=n_train)


# Noise-free curve as a dashed blue line, noisy samples as black circles.
plt.plot(xs, fn(xs), color='b', linestyle='--')
plt.plot(rxs, ys1D, color='k', marker='o', linestyle='None')
plt.xlabel('x')
plt.ylabel('y');


from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures

# Expand the single input column into polynomial terms up to degree 8.
polynomial_features = PolynomialFeatures(degree=8)
X_train = polynomial_features.fit_transform(X=X1D)

# 100 candidate regularisation strengths, log-spaced from 1e-4 to 1e4.
alpha_values = np.logspace(start=-4, stop=4, num=100)
parameters = dict(alpha=alpha_values)
r = Ridge()
# 5-fold cross-validated search over alpha.
Rsearch = GridSearchCV(estimator=r, param_grid=parameters, cv=5)
Rsearch.fit(X=X_train, y=ys1D);


# List the names of the result arrays recorded by the grid search.
Rsearch.cv_results_.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_alpha', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score'])


# Mean and spread of the cross-validation score for every alpha tried.
scores, scores_std = (
    Rsearch.cv_results_[key]
    for key in ('mean_test_score', 'std_test_score')
)
# Shaded band of one standard deviation either side of the mean curve.
plt.fill_between(
    x=alpha_values,
    y1=scores - scores_std,
    y2=scores + scores_std,
    color="g",
    alpha=0.1,
)
plt.plot(alpha_values, scores)
plt.xscale('log')
plt.xlabel(r'Regularisation parameter $\alpha$')
plt.ylabel('Average score');


# Evaluation grid from 0 to 5 in steps of 0.1, as a column vector.
xval = np.arange(0, 5.1, 0.1)[:, np.newaxis]
# Same polynomial expansion as used for training, then predict with the
# best Ridge model found by the search.
pxval = polynomial_features.transform(xval)
ypred = Rsearch.best_estimator_.predict(pxval)

# Training samples as black circles, fitted curve in red.
plt.plot(rxs, ys1D, color='k', marker='o', linestyle='None')
plt.plot(xval, ypred, color='r')

plt.xlabel('x')
plt.ylabel('y');


from sklearn.pipeline import Pipeline

# Chain the polynomial expansion and the Ridge regressor into one estimator.
# The step names ('poly', 'ridge') are the prefixes used to address each
# step's parameters in a grid search.
analysis_pipeline = Pipeline(steps=[
    ('poly', PolynomialFeatures(degree=8)),
    ('ridge', Ridge()),
])


# Search jointly over the polynomial degree and the Ridge penalty; the
# '<step>__<param>' keys route each value list to the matching pipeline step.
degrees = [5, 6, 7]
parameters = dict(ridge__alpha=alpha_values, poly__degree=degrees)
Psearch = GridSearchCV(estimator=analysis_pipeline, param_grid=parameters, cv=5)
Psearch.fit(X=X1D, y=ys1D);


# One score-vs-alpha curve per polynomial degree.
# cv_results_ is a flat list of every (degree, alpha) combination. The slicing
# below relies on the grid being enumerated with parameter names in sorted
# order and the last one varying fastest, i.e. all alphas for the first
# degree, then all alphas for the next, and so on.
# The chunk size and the number of curves are taken from the grids themselves
# so that editing `degrees` or `alpha_values` cannot silently mis-slice.
n_alphas = len(alpha_values)
for j, degree in enumerate(degrees):
    chunk = slice(j * n_alphas, (j + 1) * n_alphas)
    scores = Psearch.cv_results_['mean_test_score'][chunk]
    scores_std = Psearch.cv_results_['std_test_score'][chunk]
    plt.fill_between(alpha_values, scores - scores_std,
                     scores + scores_std, alpha=0.1,
                     label="n={}".format(degree))
    plt.plot(alpha_values, scores)
plt.xscale('log')
plt.legend()
plt.xlabel(r'Regularisation parameter $\alpha$')
plt.ylabel('Test score');


# Evaluation grid from 0 to 5 in steps of 0.1, as a column vector.
xval = np.arange(0, 5.1, 0.1)[:, np.newaxis]
# The best pipeline applies its own polynomial expansion, so the raw grid
# is passed straight in.
ypred = Psearch.best_estimator_.predict(xval)

# Training samples as black circles, fitted curve in red.
plt.plot(rxs, ys1D, color='k', marker='o', linestyle='None')
plt.plot(xval, ypred, color='r')

plt.xlabel('x')
plt.ylabel('y');


from sklearn.model_selection import learning_curve

# Score the best pipeline on ten training-set fractions from 10% to 100%,
# with 5-fold cross-validation. `train_sizes` ends up holding the sizes
# returned by learning_curve.
train_sizes, train_scores, test_scores = learning_curve(
    estimator=Psearch.best_estimator_,
    X=X1D,
    y=ys1D,
    cv=5,
    train_sizes=np.linspace(.1, 1.0, num=10),
)

# Collapse the per-fold scores (axis 1) into a mean and a spread per size.
train_scores_mean = train_scores.mean(axis=1)
train_scores_std = train_scores.std(axis=1)
test_scores_mean = test_scores.mean(axis=1)
test_scores_std = test_scores.std(axis=1)


# Shaded bands of one standard deviation around each mean curve:
# red for training, green for cross-validation.
plt.fill_between(
    x=train_sizes,
    y1=train_scores_mean - train_scores_std,
    y2=train_scores_mean + train_scores_std,
    color="r",
    alpha=0.1,
)
plt.fill_between(
    x=train_sizes,
    y1=test_scores_mean - test_scores_std,
    y2=test_scores_mean + test_scores_std,
    color="g",
    alpha=0.1,
)

# Mean curves, drawn as connected circles.
plt.plot(train_sizes, train_scores_mean, color="r", marker='o',
         linestyle='-', label="Training score")
plt.plot(train_sizes, test_scores_mean, color="g", marker='o',
         linestyle='-', label="Cross-validation score")

# Restrict the y axis to [0, 1], then add a grid and a legend.
plt.ylim(0, 1)
plt.grid()
plt.legend(loc="best");

Summative assignment

Summative assignment Potential Topics

Summative assignment Non-Topics

Using Pandas

Using scikit-learn

Scikit-learn

Preprocessor

Models

Interface

Tools

Model selection with `GridSearchCV`

Pipelines

	name	Ex 1	Ex 2	Ex 3	Ex 4	passed
0	John	86	57	45	32	true
1	Mary	13	36	24	53	false
2	Alice	90	67	87	31	true
3	Bob	78	76	68	89	true
4	Claire	54	32	21	11	false

	name	Ex 1	Ex 2	Ex 3	Ex 4	passed	average
0	John	86	57	45	32	True	55.00
1	Mary	13	36	24	53	False	31.50
2	Alice	90	67	87	31	True	68.75
3	Bob	78	76	68	89	True	77.75
4	Claire	54	32	21	11	False	29.50

Summative assignment

Summative assignment Potential Topics

Summative assignment Non-Topics

Using Pandas

Using scikit-learn

Scikit-learn

Preprocessor

Models

Interface

Tools

Model selection with `GridSearchCV`

Pipelines

Model selection with `GridSearchCV`