%matplotlib inline
import matplotlib.pyplot as plt

import pandas as pd
pd.__version__ ## installed pandas version (public attribute; pd._version is a private module)
import urllib.request

## Location of the raw Iris data; used for both the preview and the real load.
IRIS_URL = "http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"

## Peek at the first 100 characters of the raw file. The context manager
## closes the HTTP response as soon as the preview has been read.
with urllib.request.urlopen(IRIS_URL) as response:
    print(response.read().decode('ascii')[:100], '...')

## header=None: the first line of the file is treated as data, not as column
## names, so the names are supplied explicitly.
df = pd.read_csv(IRIS_URL,
                 header=None,
                 names=['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width', 'Species'])
type(df)
## DataFrames
df.head(5)
## Tour of the basic ways to pull data out of a DataFrame.
df.head(5).values ## extract numpy array
df.head(5)['Petal Length'] ## extract entire columns by name
df.head(5).Species ## attribute access is a shorthand for ['Species']
df.head(5)[['Species', 'Petal Length']] ## extract more than one column
df.T ## Transpose with labels
df.index ## DataFrame rows are indexed
## .loc selects rows by index label. It replaces .ix, which was deprecated in
## pandas 0.20 and removed in pandas 1.0.
df.loc[4] ## Extract individual rows by their index
df.loc[[4, 17, 23]] ## a list of labels returns a DataFrame
df.head(5)
df.head(5)['Sepal Length'] < 5 ## Boolean queries
df[df['Sepal Length'] < 5].head(3) ## Boolean indexing
## Cloning DataFrames: changes to df2 below leave df untouched.
df2 = df.copy()

## Distinct species labels, in order of first appearance.
all_species = df['Species'].unique().tolist()
print(all_species)

## Adding columns / transforming existing columns:
## 'Type' holds the position of each row's species label within all_species.
df2['Type'] = df['Species'].map(all_species.index)
df2.head(5)

## Quick summary statistics
df.describe()
## Note: the 25th percentile is the value x such that 25% of the data are $\leq$ x (the 50th percentile is the median).
df.groupby("Species").describe() ## Summary statistics per species

## One marker colour per species. The mapping is built once rather than
## inside the per-row callback.
species_colors = {'Iris-setosa': 'red',
                  'Iris-virginica': 'green',
                  'Iris-versicolor': 'blue'}
## Store the colours as a real column. Assigning to a new attribute name
## (df2.dot_color = ...) does not create a column and triggers a pandas
## UserWarning. Indexing the dict directly keeps the KeyError for any
## species label that has no colour assigned.
df2['dot_color'] = df2['Species'].map(lambda s: species_colors[s])
## Quick plotting
df2.plot('Petal Width', 'Petal Length', kind='scatter', c=df2['dot_color']);
df.plot(kind='box');
import seaborn as sb
## Just put in a DataFrame object,
## seaborn does the right thing automatically:
## (height= is the seaborn >= 0.9 name of the former size= argument)
sb.pairplot(df, hue='Species', height=3);
sb.lmplot(x='Sepal Width', y='Petal Length', data=df); # Does linear regression automatically
## One panel per species; g.map draws a scatter plot of the two named columns in each panel.
g = sb.FacetGrid(df, col='Species', hue="Species")
g.map(plt.scatter, "Petal Length", "Petal Width");
## Add two boolean columns
df2 = df.copy()
## Flag the rows that lie below the column mean. Each mean is computed once
## and compared against the whole column in a single vectorised operation,
## instead of being recomputed for every element.
petal_length_mean = df2['Petal Length'].mean()
sepal_width_mean = df2['Sepal Width'].mean()
df2['PL_low'] = df2['Petal Length'] < petal_length_mean
df2['SW_low'] = df2['Sepal Width'] < sepal_width_mean
df2.head(5)
## Grid of panels: columns split on PL_low, rows on SW_low, colour by species.
g = sb.FacetGrid(df2, col='PL_low', row='SW_low', hue='Species')
g.map(plt.scatter, "Petal Width", "Sepal Length");