import pandas as pd
pd._version.get_versions()
{'dirty': False, 'error': None, 'full-revisionid': '171c71611886aab8549a8620c5b0071a129ad685', 'version': '0.25.1'}
import urllib.request
## Peek at the first 100 characters of the raw data file.
## The response is opened in a `with` block so the HTTP connection is
## closed as soon as the preview has been read.
with urllib.request.urlopen(
        "http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data") as response:
    print(response.read().decode('ascii')[:100], '...')
5.1,3.5,1.4,0.2,Iris-setosa 4.9,3.0,1.4,0.2,Iris-setosa 4.7,3.2,1.3,0.2,Iris-setosa 4.6,3.1,1.5,0.2, ...
## Show the "Attribute Information" excerpt (characters 2410-2620) of the
## dataset description file. The response is opened in a `with` block so the
## HTTP connection is closed as soon as the text has been read.
with urllib.request.urlopen(
        "http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.names") as response:
    print('...', response.read().decode('ascii')[2410:2620], '...', sep='\n')
... 7. Attribute Information: 1. sepal length in cm 2. sepal width in cm 3. petal length in cm 4. petal width in cm 5. class: -- Iris Setosa -- Iris Versicolour -- Iris Virginica ...
## Load the raw CSV straight from the UCI repository into a DataFrame.
## The file has no header row (its first line is already a data record),
## hence header=None; the column names are supplied by hand, following the
## attribute order listed in iris.names.
df = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data",
header=None,
names=['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width', 'Species'])
type(df)
pandas.core.frame.DataFrame
DataFrames
df.head(5)
Sepal Length | Sepal Width | Petal Length | Petal Width | Species | |
---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | Iris-setosa |
1 | 4.9 | 3.0 | 1.4 | 0.2 | Iris-setosa |
2 | 4.7 | 3.2 | 1.3 | 0.2 | Iris-setosa |
3 | 4.6 | 3.1 | 1.5 | 0.2 | Iris-setosa |
4 | 5.0 | 3.6 | 1.4 | 0.2 | Iris-setosa |
df.head(5).values ## extract numpy array
array([[5.1, 3.5, 1.4, 0.2, 'Iris-setosa'], [4.9, 3.0, 1.4, 0.2, 'Iris-setosa'], [4.7, 3.2, 1.3, 0.2, 'Iris-setosa'], [4.6, 3.1, 1.5, 0.2, 'Iris-setosa'], [5.0, 3.6, 1.4, 0.2, 'Iris-setosa']], dtype=object)
df['Petal Length'] ## extract entire columns by name
0 1.4 1 1.4 2 1.3 3 1.5 4 1.4 ... 145 5.2 146 5.0 147 5.2 148 5.4 149 5.1 Name: Petal Length, Length: 150, dtype: float64
df.head(5).Species
0 Iris-setosa 1 Iris-setosa 2 Iris-setosa 3 Iris-setosa 4 Iris-setosa Name: Species, dtype: object
df.head(5)[['Species', 'Petal Length']] ## extract more than one column
Species | Petal Length | |
---|---|---|
0 | Iris-setosa | 1.4 |
1 | Iris-setosa | 1.4 |
2 | Iris-setosa | 1.3 |
3 | Iris-setosa | 1.5 |
4 | Iris-setosa | 1.4 |
df.T ## Transpose with labels
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Sepal Length | 5.1 | 4.9 | 4.7 | 4.6 | 5 | 5.4 | 4.6 | 5 | 4.4 | 4.9 | ... | 6.7 | 6.9 | 5.8 | 6.8 | 6.7 | 6.7 | 6.3 | 6.5 | 6.2 | 5.9 |
Sepal Width | 3.5 | 3 | 3.2 | 3.1 | 3.6 | 3.9 | 3.4 | 3.4 | 2.9 | 3.1 | ... | 3.1 | 3.1 | 2.7 | 3.2 | 3.3 | 3 | 2.5 | 3 | 3.4 | 3 |
Petal Length | 1.4 | 1.4 | 1.3 | 1.5 | 1.4 | 1.7 | 1.4 | 1.5 | 1.4 | 1.5 | ... | 5.6 | 5.1 | 5.1 | 5.9 | 5.7 | 5.2 | 5 | 5.2 | 5.4 | 5.1 |
Petal Width | 0.2 | 0.2 | 0.2 | 0.2 | 0.2 | 0.4 | 0.3 | 0.2 | 0.2 | 0.1 | ... | 2.4 | 2.3 | 1.9 | 2.3 | 2.5 | 2.3 | 1.9 | 2 | 2.3 | 1.8 |
Species | Iris-setosa | Iris-setosa | Iris-setosa | Iris-setosa | Iris-setosa | Iris-setosa | Iris-setosa | Iris-setosa | Iris-setosa | Iris-setosa | ... | Iris-virginica | Iris-virginica | Iris-virginica | Iris-virginica | Iris-virginica | Iris-virginica | Iris-virginica | Iris-virginica | Iris-virginica | Iris-virginica |
5 rows × 150 columns
df.index ## DataFrame rows are indexed
RangeIndex(start=0, stop=150, step=1)
df.iloc[4] ## Extract individual rows by their index
Sepal Length 5 Sepal Width 3.6 Petal Length 1.4 Petal Width 0.2 Species Iris-setosa Name: 4, dtype: object
df.iloc[[4, 17, 23]]
Sepal Length | Sepal Width | Petal Length | Petal Width | Species | |
---|---|---|---|---|---|
4 | 5.0 | 3.6 | 1.4 | 0.2 | Iris-setosa |
17 | 5.1 | 3.5 | 1.4 | 0.3 | Iris-setosa |
23 | 5.1 | 3.3 | 1.7 | 0.5 | Iris-setosa |
df.head(8)
Sepal Length | Sepal Width | Petal Length | Petal Width | Species | |
---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | Iris-setosa |
1 | 4.9 | 3.0 | 1.4 | 0.2 | Iris-setosa |
2 | 4.7 | 3.2 | 1.3 | 0.2 | Iris-setosa |
3 | 4.6 | 3.1 | 1.5 | 0.2 | Iris-setosa |
4 | 5.0 | 3.6 | 1.4 | 0.2 | Iris-setosa |
5 | 5.4 | 3.9 | 1.7 | 0.4 | Iris-setosa |
6 | 4.6 | 3.4 | 1.4 | 0.3 | Iris-setosa |
7 | 5.0 | 3.4 | 1.5 | 0.2 | Iris-setosa |
#(df.head(8)['Sepal Length'] < 5) #|
(df.head(8)['Sepal Length'] > 4.6)
0 True 1 True 2 True 3 False 4 True 5 True 6 False 7 True Name: Sepal Length, dtype: bool
df[~(df['Sepal Length'] < 5)].head(3) ## Boolean indexing
Sepal Length | Sepal Width | Petal Length | Petal Width | Species | |
---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | Iris-setosa |
4 | 5.0 | 3.6 | 1.4 | 0.2 | Iris-setosa |
5 | 5.4 | 3.9 | 1.7 | 0.4 | Iris-setosa |
df2 = df.copy() ## Cloning DataFrames: new columns go into df2, df stays as loaded
all_species = list(df.Species.unique()) ## the distinct species names
print(all_species)
## Adding columns / transforming existing columns:
## 'Class' is the position of each row's species name within all_species
df2['Class'] = df.Species.map(lambda s: all_species.index(s))
df2.tail(5)
['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
Sepal Length | Sepal Width | Petal Length | Petal Width | Species | Class | |
---|---|---|---|---|---|---|
145 | 6.7 | 3.0 | 5.2 | 2.3 | Iris-virginica | 2 |
146 | 6.3 | 2.5 | 5.0 | 1.9 | Iris-virginica | 2 |
147 | 6.5 | 3.0 | 5.2 | 2.0 | Iris-virginica | 2 |
148 | 6.2 | 3.4 | 5.4 | 2.3 | Iris-virginica | 2 |
149 | 5.9 | 3.0 | 5.1 | 1.8 | Iris-virginica | 2 |
df.describe().round(2) ## Quick summary statistics
Sepal Length | Sepal Width | Petal Length | Petal Width | |
---|---|---|---|---|
count | 150.00 | 150.00 | 150.00 | 150.00 |
mean | 5.84 | 3.05 | 3.76 | 1.20 |
std | 0.83 | 0.43 | 1.76 | 0.76 |
min | 4.30 | 2.00 | 1.00 | 0.10 |
25% | 5.10 | 2.80 | 1.60 | 0.30 |
50% | 5.80 | 3.00 | 4.35 | 1.30 |
75% | 6.40 | 3.30 | 5.10 | 1.80 |
max | 7.90 | 4.40 | 6.90 | 2.50 |
Note: (25th percentile == x) $\leftrightarrow$ (25% of the data are $\leq$ x) etc.
df.groupby("Species").mean().T
Species | Iris-setosa | Iris-versicolor | Iris-virginica |
---|---|---|---|
Sepal Length | 5.006 | 5.936 | 6.588 |
Sepal Width | 3.418 | 2.770 | 2.974 |
Petal Length | 1.464 | 4.260 | 5.552 |
Petal Width | 0.244 | 1.326 | 2.026 |
## Add a 'dot_color' column: one fixed plotting colour per species
df2['dot_color'] = df2.Species.map(lambda s: {'Iris-setosa': 'red',
'Iris-virginica': 'green',
'Iris-versicolor': 'blue'}[s])
## Quick plotting
## Scatter plot with 'Petal Width' on the x-axis and 'Sepal Length' on the
## y-axis; each dot is coloured by the row's 'dot_color' value
df2.plot('Petal Width', 'Sepal Length', kind='scatter', c=df2.dot_color, legend=True);
## Box plots of the measurement columns, grouped by species
df.groupby('Species').plot.box();
## combine multiple boolean masks (boolean Series) with | (or), & (and);
## here: keep only the setosa and virginica rows
subset = df2[(df2.Species == 'Iris-setosa') | (df2.Species == 'Iris-virginica')]
## alternative: negate with ~
# subset = df2[~(df2.Species == 'Iris-versicolor')]
## keep just the single predictor and the numeric class label
subset = subset[['Sepal Length', 'Class'] ]
subset
Sepal Length | Class | |
---|---|---|
0 | 5.1 | 0 |
1 | 4.9 | 0 |
2 | 4.7 | 0 |
3 | 4.6 | 0 |
4 | 5.0 | 0 |
... | ... | ... |
145 | 6.7 | 2 |
146 | 6.3 | 2 |
147 | 6.5 | 2 |
148 | 6.2 | 2 |
149 | 5.9 | 2 |
100 rows × 2 columns
## make separate vectors for x and y, and make the class -1 or 1:
## in `subset` the 'Class' column only takes the values 0 (setosa) and
## 2 (virginica), so subtracting 1 maps them to -1 and +1
x = subset['Sepal Length']
y = subset['Class'] - 1
## one dot per flower: sepal length against the -1/+1 label
plt.scatter(x, y);
## Put everything in one matrix, with constant column prepended
import numpy as np
## Stack three rows (all ones, x, y) on top of each other ...
examples = np.vstack([np.ones_like(x), x, y])
## ... then transpose so that each row is one example: [1, sepal length, label]
examples = examples.T
print(examples.shape)
(100, 3)
examples[:5]
array([[ 1. , 5.1, -1. ], [ 1. , 4.9, -1. ], [ 1. , 4.7, -1. ], [ 1. , 4.6, -1. ], [ 1. , 5. , -1. ]])
We implement the basic LMS (least mean squares) algorithm, using the residual sum of squares (RSS) to measure the error.
Design decisions:
## using numpy:
y_true = np.array([1, -1, 1, 1])
y_pred = np.array([.8, -.5, -.4, .6])
y_true - y_pred
array([ 0.2, -0.5, 1.4, 0.4])
(y_true - y_pred)**2
array([0.04, 0.25, 1.96, 0.16])
((y_true - y_pred)**2).sum()
2.4099999999999997
def rss(examples, w):
    """Compute the residual sum of squares for a linear model.

    Arguments:
    examples -- (n, p + 1)-matrix (or nested sequence) of predictors and
                response: the first p columns are the predictors, the
                last column is the response
    w        -- p-vector of linear model weights

    Returns:
    The sum over all n rows of (response - prediction)**2, where the
    prediction for a row is the dot product of w with its predictors.
    """
    ### Accept plain lists as well as numpy arrays.
    examples = np.asarray(examples)
    w = np.asarray(w)
    x = examples[:, :-1]    ### First p columns: shape = (n, p)
    y = examples[:, -1]     ### Last column: shape = (n,)
    y_pred = w.dot(x.T)     ### One prediction per example: shape = (n,)
    ### Use a distinct local name so the function `rss` is not shadowed.
    squared_residuals = (y - y_pred)**2
    return squared_residuals.sum()
def lms(examples, eta, iterations, print_every=1000, seed=2):
    """Fit a linear model with the LMS (least mean squares) update rule.

    Each iteration picks one example at random and moves the weights along
    that example's predictors, proportionally to its prediction error.

    Arguments:
    examples    -- (n, p + 1)-matrix (or nested sequence) of predictors and
                   response; the last column is the response variable
    eta         -- learning rate (step size of each update)
    iterations  -- number of single-example updates to perform
    print_every -- report the RSS every `print_every` iterations and on the
                   final iteration; 0 or None disables the progress output
    seed        -- seed for NumPy's global random generator (default 2, the
                   value that used to be hard-coded); None leaves the
                   generator state untouched

    Returns:
    The p-vector of fitted weights.
    """
    if seed is not None:
        ### NOTE: this reseeds NumPy's *global* generator, as before.
        np.random.seed(seed)
    examples = np.asarray(examples)
    rows, columns = examples.shape
    p = columns - 1    ### last column is the response variable
    w = np.random.uniform(low=-1.0, high=1.0, size=p)
    last_iteration = iterations - 1
    for iteration in range(iterations):
        rand = np.random.randint(0, rows)    ### select random index
        x = examples[rand, :-1]              ### Everything but the last column
        c = examples[rand, -1]               ### The last column, as a scalar
        y = w.dot(x)
        error = c - y                        ### Error in the single chosen example
        w += (eta * error * x)
        ### Guarding on print_every avoids a ZeroDivisionError for 0.
        if print_every and (iteration % print_every == 0
                            or iteration == last_iteration):
            err = rss(examples, w)
            print(f"Iteration: {iteration} RSS: {err:.2f}")
    return w
%%time
w = lms(examples, eta=0.05, iterations=10000)
Iteration: 0 RSS: 188.48 Iteration: 1000 RSS: 5033.94 Iteration: 2000 RSS: 212.52 Iteration: 3000 RSS: 856.83 Iteration: 4000 RSS: 170.41 Iteration: 5000 RSS: 583.72 Iteration: 6000 RSS: 732.37 Iteration: 7000 RSS: 468.00 Iteration: 8000 RSS: 45.65 Iteration: 9000 RSS: 45.88 Iteration: 9999 RSS: 42.48 CPU times: user 514 ms, sys: 30.1 ms, total: 544 ms Wall time: 503 ms
print(w, rss(examples, w))
[-5.64992028 1.03311639] 42.48252281358642
## Endpoints of the fitted line: evaluate the model w[0] + w[1] * x at the
## smallest and largest sepal length found in the examples
line_x = np.array([min(examples[:,1]), max(examples[:,1])])
## Prepend the constant 1 to each endpoint so that w.dot(...) gives both predictions
line_y = w.dot(np.array([np.ones_like(line_x), line_x]))
print('w =', w)
print('line_x =', line_x)
print('line_y =', line_y)
w = [-5.64992028 1.03311639] line_x = [4.3 7.9] line_y = [-1.20751981 2.51169918]
## The examples: sepal length (column 1) against the -1/+1 label (column 2)
plt.scatter(examples[:,1], examples[:,2])
plt.xlabel("Sepal Length")
plt.ylabel("Species");
plt.ylim((-1.1,1.1))
## Dashed black line at y = 0: predictions above it have sign +1, below it -1
plt.axhline(0, c='k', ls='--')
## Dashed green line at the x where w[0] + w[1] * x = 0, i.e. the decision boundary
plt.axvline(-w[0]/w[1], c='g', ls='--')
## The fitted line itself, in red
plt.plot(line_x, line_y, 'r');
## Count the examples whose predicted sign equals their -1/+1 label
(np.sign(w.dot(examples[:,:2].T)) == examples[:,2]).sum()
94
import seaborn as sb
## Just put in a DataFrame object,
## seaborn does the right thing automatically:
## pairwise plots of the columns, coloured by 'Species', histograms on the diagonal
sb.pairplot(df, hue='Species', height=3, diag_kind='hist');
## lmplot() does the linear regression automatically;
## the setosa rows are filtered out before plotting
sb.lmplot(x='Sepal Width', y='Petal Length', data=df[~(df.Species == 'Iris-setosa')]);