In [33]:
%matplotlib inline
import matplotlib.pyplot as plt

Two More Useful Python Libraries

Pandas

Pandas!

Pandas

  • Python library for data manipulation and analysis
  • Built on top of NumPy; plotting is done through matplotlib
  • Makes some common data analysis tasks very easy
  • Documentation: pandas.pydata.org
In [34]:
import pandas as pd

## Show which pandas build is installed.
## NOTE(review): `_version` is a private pandas module (leading underscore);
## the public way to get the version string is `pd.__version__`.
pd._version.get_versions()
Out[34]:
{'dirty': False,
 'error': None,
 'full-revisionid': '825876ca7ee8ac7bea463925399c083d5f190b3e',
 'version': '0.19.2'}
In [35]:
import urllib.request

## Peek at the raw file: print the first 100 characters of the download.
## The `with` block closes the HTTP response as soon as the data has been
## read, rather than relying on it being cleaned up implicitly.
with urllib.request.urlopen(
        "http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data") as response:
    raw_text = response.read().decode('ascii')
print(raw_text[:100], '...')
5.1,3.5,1.4,0.2,Iris-setosa
4.9,3.0,1.4,0.2,Iris-setosa
4.7,3.2,1.3,0.2,Iris-setosa
4.6,3.1,1.5,0.2, ...
In [36]:
## Load the Iris data straight from the UCI repository into a DataFrame.
df = pd.read_csv(
    "http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data",
    header=None,  ## the file starts directly with data rows, so the names are given here
    names=[
        'Sepal Length',
        'Sepal Width',
        'Petal Length',
        'Petal Width',
        'Species',
    ],
)
In [37]:
type(df)
Out[37]:
pandas.core.frame.DataFrame

Pandas DataFrames

  • Two-dimensional (row/column) tabular data type
  • Essentially a matrix with labels and indexes
  • Many useful operations
In [38]:
df.head(5)
Out[38]:
Sepal Length Sepal Width Petal Length Petal Width Species
0 5.1 3.5 1.4 0.2 Iris-setosa
1 4.9 3.0 1.4 0.2 Iris-setosa
2 4.7 3.2 1.3 0.2 Iris-setosa
3 4.6 3.1 1.5 0.2 Iris-setosa
4 5.0 3.6 1.4 0.2 Iris-setosa
In [39]:
df.head(5).values  ## extract numpy array
Out[39]:
array([[5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
       [4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
       [4.7, 3.2, 1.3, 0.2, 'Iris-setosa'],
       [4.6, 3.1, 1.5, 0.2, 'Iris-setosa'],
       [5.0, 3.6, 1.4, 0.2, 'Iris-setosa']], dtype=object)
In [40]:
df.head(5)['Petal Length']  ## extract entire columns by name
Out[40]:
0    1.4
1    1.4
2    1.3
3    1.5
4    1.4
Name: Petal Length, dtype: float64
In [41]:
df.head(5).Species  ## columns can also be accessed as attributes
Out[41]:
0    Iris-setosa
1    Iris-setosa
2    Iris-setosa
3    Iris-setosa
4    Iris-setosa
Name: Species, dtype: object
In [42]:
df.head(5)[['Species', 'Petal Length']]  ## extract more than one column
Out[42]:
Species Petal Length
0 Iris-setosa 1.4
1 Iris-setosa 1.4
2 Iris-setosa 1.3
3 Iris-setosa 1.5
4 Iris-setosa 1.4
In [43]:
df.T  ## Transpose with labels
Out[43]:
0 1 2 3 4 5 6 7 8 9 ... 140 141 142 143 144 145 146 147 148 149
Sepal Length 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... 6.7 6.9 5.8 6.8 6.7 6.7 6.3 6.5 6.2 5.9
Sepal Width 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... 3.1 3.1 2.7 3.2 3.3 3 2.5 3 3.4 3
Petal Length 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... 5.6 5.1 5.1 5.9 5.7 5.2 5 5.2 5.4 5.1
Petal Width 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... 2.4 2.3 1.9 2.3 2.5 2.3 1.9 2 2.3 1.8
Species Iris-setosa Iris-setosa Iris-setosa Iris-setosa Iris-setosa Iris-setosa Iris-setosa Iris-setosa Iris-setosa Iris-setosa ... Iris-virginica Iris-virginica Iris-virginica Iris-virginica Iris-virginica Iris-virginica Iris-virginica Iris-virginica Iris-virginica Iris-virginica

5 rows × 150 columns

In [44]:
df.index  ## DataFrame rows are indexed
Out[44]:
RangeIndex(start=0, stop=150, step=1)
In [45]:
## Extract individual rows by their index label.
## (`.loc` is the label-based indexer; `.ix` is deprecated and removed in pandas 1.0.)
df.loc[4]
Out[45]:
Sepal Length              5
Sepal Width             3.6
Petal Length            1.4
Petal Width             0.2
Species         Iris-setosa
Name: 4, dtype: object
In [46]:
## Pass a list of index labels to extract several rows at once.
## (`.loc` is the label-based indexer; `.ix` is deprecated and removed in pandas 1.0.)
df.loc[[4, 17, 23]]
Out[46]:
Sepal Length Sepal Width Petal Length Petal Width Species
4 5.0 3.6 1.4 0.2 Iris-setosa
17 5.1 3.5 1.4 0.3 Iris-setosa
23 5.1 3.3 1.7 0.5 Iris-setosa
In [47]:
df.head(5)
Out[47]:
Sepal Length Sepal Width Petal Length Petal Width Species
0 5.1 3.5 1.4 0.2 Iris-setosa
1 4.9 3.0 1.4 0.2 Iris-setosa
2 4.7 3.2 1.3 0.2 Iris-setosa
3 4.6 3.1 1.5 0.2 Iris-setosa
4 5.0 3.6 1.4 0.2 Iris-setosa
In [48]:
df.head(5)['Sepal Length'] < 5  ## Boolean queries
Out[48]:
0    False
1     True
2     True
3     True
4    False
Name: Sepal Length, dtype: bool
In [49]:
df[df['Sepal Length'] < 5].head(3) ## Boolean indexing
Out[49]:
Sepal Length Sepal Width Petal Length Petal Width Species
1 4.9 3.0 1.4 0.2 Iris-setosa
2 4.7 3.2 1.3 0.2 Iris-setosa
3 4.6 3.1 1.5 0.2 Iris-setosa
In [50]:
df2 = df.copy()  ## Cloning DataFrames

## Distinct species names, in order of first appearance
all_species = list(df.Species.unique())
print(all_species)

## Adding columns         ## Transforming existing columns
df2['Type'] = df.Species.map(lambda s: all_species.index(s))  ## Type = position of the species in all_species
df2.head(5)
['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
Out[50]:
Sepal Length Sepal Width Petal Length Petal Width Species Type
0 5.1 3.5 1.4 0.2 Iris-setosa 0
1 4.9 3.0 1.4 0.2 Iris-setosa 0
2 4.7 3.2 1.3 0.2 Iris-setosa 0
3 4.6 3.1 1.5 0.2 Iris-setosa 0
4 5.0 3.6 1.4 0.2 Iris-setosa 0
In [51]:
df.describe() ## Quick summary statistics
Out[51]:
Sepal Length Sepal Width Petal Length Petal Width
count 150.000000 150.000000 150.000000 150.000000
mean 5.843333 3.054000 3.758667 1.198667
std 0.828066 0.433594 1.764420 0.763161
min 4.300000 2.000000 1.000000 0.100000
25% 5.100000 2.800000 1.600000 0.300000
50% 5.800000 3.000000 4.350000 1.300000
75% 6.400000 3.300000 5.100000 1.800000
max 7.900000 4.400000 6.900000 2.500000

Note: the 25th percentile is the value $x$ such that 25% of the data are $\leq x$. The 50th percentile is the median.

In [52]:
df.groupby("Species").describe()  ## Summary statistics per species
Out[52]:
Petal Length Petal Width Sepal Length Sepal Width
Species
Iris-setosa count 50.000000 50.000000 50.000000 50.000000
mean 1.464000 0.244000 5.006000 3.418000
std 0.173511 0.107210 0.352490 0.381024
min 1.000000 0.100000 4.300000 2.300000
25% 1.400000 0.200000 4.800000 3.125000
50% 1.500000 0.200000 5.000000 3.400000
75% 1.575000 0.300000 5.200000 3.675000
max 1.900000 0.600000 5.800000 4.400000
Iris-versicolor count 50.000000 50.000000 50.000000 50.000000
mean 4.260000 1.326000 5.936000 2.770000
std 0.469911 0.197753 0.516171 0.313798
min 3.000000 1.000000 4.900000 2.000000
25% 4.000000 1.200000 5.600000 2.525000
50% 4.350000 1.300000 5.900000 2.800000
75% 4.600000 1.500000 6.300000 3.000000
max 5.100000 1.800000 7.000000 3.400000
Iris-virginica count 50.000000 50.000000 50.000000 50.000000
mean 5.552000 2.026000 6.588000 2.974000
std 0.551895 0.274650 0.635880 0.322497
min 4.500000 1.400000 4.900000 2.200000
25% 5.100000 1.800000 6.225000 2.800000
50% 5.550000 2.000000 6.500000 3.000000
75% 5.875000 2.300000 6.900000 3.175000
max 6.900000 2.500000 7.900000 3.800000
In [53]:
## Map each species to a marker colour. The colours are kept in their own
## Series: assigning `df2.dot_color = ...` would only set a Python attribute
## on the DataFrame object, not create a column (newer pandas versions warn
## about exactly that).
dot_colors = df2.Species.map({'Iris-setosa': 'red',
                              'Iris-virginica': 'green',
                              'Iris-versicolor': 'blue'})
## Quick plotting
df2.plot('Petal Width', 'Petal Length', kind='scatter', c=dot_colors);

"Box-and-whisker" plot

In [54]:
## Draw the box plot through the `.plot.box()` accessor
df.plot.box();

Seaborn: Statistical Data Visualization

In [55]:
import seaborn as sb
In [56]:
## Just put in a DataFrame object,
## seaborn does the right thing automatically:
## NOTE(review): seaborn 0.9 renamed `size` to `height`; update this keyword
## when running on a newer seaborn -- confirm against the installed version.
sb.pairplot(df, hue='Species', size=3);
In [57]:
sb.lmplot(x='Sepal Width', y='Petal Length', data=df); # Does linear regression automatically

Plotting data facets on grids

In [61]:
## One panel (column) per species; each panel is a scatter plot of petal length vs. petal width
g = sb.FacetGrid(df, col='Species', hue="Species")
g.map(plt.scatter, "Petal Length", "Petal Width");
In [59]:
## Add two boolean columns: is each value below the mean of its column?
df2 = df.copy()
## Vectorized comparison: each mean is computed once per column and the whole
## column is compared in a single operation (no per-row Python lambda).
df2['PL_low'] = df2['Petal Length'] < df2['Petal Length'].mean()
df2['SW_low'] = df2['Sepal Width'] < df2['Sepal Width'].mean()

df2.head(5)
Out[59]:
Sepal Length Sepal Width Petal Length Petal Width Species PL_low SW_low
0 5.1 3.5 1.4 0.2 Iris-setosa True False
1 4.9 3.0 1.4 0.2 Iris-setosa True True
2 4.7 3.2 1.3 0.2 Iris-setosa True False
3 4.6 3.1 1.5 0.2 Iris-setosa True False
4 5.0 3.6 1.4 0.2 Iris-setosa True False
In [60]:
## Grid of panels: columns split on PL_low, rows split on SW_low, points coloured by species
g = sb.FacetGrid(df2, col='PL_low', row='SW_low', hue='Species')
g.map(plt.scatter, "Petal Width", "Sepal Length");