Example: Mushroom Dataset

In [3]:

# Toy training set: five mushrooms described by three categorical attributes
# (Color, Size, Points) plus the class column Eatability.
mushrooms_df = pd.read_csv(
    StringIO(
        """
Color,Size,Points,Eatability
red,small,yes,toxic
brown,small,no,eatable
brown,large,yes,eatable
green,small,no,eatable
red,large,no,eatable
"""
    )
)

# One dict per row, mapping column name -> cell value; this is the example
# format consumed by the tree-learning functions below.
mushrooms_examples = mushrooms_df.to_dict(orient="records")
mushrooms_examples

Out[3]:

[{'Color': 'red', 'Eatability': 'toxic', 'Points': 'yes', 'Size': 'small'},
 {'Color': 'brown', 'Eatability': 'eatable', 'Points': 'no', 'Size': 'small'},
 {'Color': 'brown', 'Eatability': 'eatable', 'Points': 'yes', 'Size': 'large'},
 {'Color': 'green', 'Eatability': 'eatable', 'Points': 'no', 'Size': 'small'},
 {'Color': 'red', 'Eatability': 'eatable', 'Points': 'no', 'Size': 'large'}]

In [4]:

from collections import Counter
import math

def log2(x):
    """Return the base-2 logarithm of x, with the convention log2(0) == 0.

    The zero convention lets entropy terms of the form p * log2(p) evaluate
    to 0 for p == 0 instead of raising a math domain error.
    """
    return 0 if x == 0 else math.log2(x)


def most_common_class(examples, target_attribute):
    """Find the value of target_attribute that occurs most often in examples.

    Each example is indexed with target_attribute (e.g. a dict row).

    >>> most_common_class([{'c':1}, {'c': 2}, {'c': 1}], 'c')
    1
    """
    tally = Counter()
    for example in examples:
        tally[example[target_attribute]] += 1
    ranked = tally.most_common(1)
    return ranked[0][0]


def priors(items):
    """Return the relative frequency of every distinct value in items.

    Parameter "items" is any iterable of hashable values; it is consumed
    exactly once, so generators are fine. The result is a dictionary with
    d[v] = (number of occurrences of v) / (total number of items).
    For example, the items ['a', 'b', 'a', 'a'] give {'a': 0.75, 'b': 0.25}.
    An empty input yields an empty dictionary.
    """
    c = Counter(items)
    # The total is the same for every key, so compute it once instead of
    # re-summing all counts inside the comprehension.
    total = sum(c.values())
    return {k: v / total for k, v in c.items()}

def posteriors(items):
    """Returns dictionary of posterior probabilities with d[(a,b)] = P(A=a|B=b).
    Parameter "items" is an iterable of (a,b)-tuples. It is traversed exactly
    once, so iterators and generators work as well as lists.

    Every combination of an observed a with an observed b gets an entry;
    combinations that never occur together have probability 0.0.

    >>> list(sorted(posteriors([(1,6), (2,6), (1,6), (3,6), (1,5), (2,5)]).items()))
    [((1, 5), 0.5), ((1, 6), 0.5), ((2, 5), 0.5), ((2, 6), 0.25), ((3, 5), 0.0), ((3, 6), 0.25)]
    """
    # Single pass: remember every a value and count the a values per b.
    avals = set()
    counts_by_b = {}
    for a, b in items:
        avals.add(a)
        counts_by_b.setdefault(b, Counter())[a] += 1

    probs = {}
    for b, a_counts in counts_by_b.items():
        s = sum(a_counts.values())
        for a in avals:
            # Indexing a Counter with a missing key gives 0 without inserting it.
            probs[(a, b)] = a_counts[a] / s
    return probs

In [5]:

def conditional_entropy(examples, attribute, target):
    """Conditional entropy H(target | attribute) estimated from examples.

    For every observed value b of _attribute_ the entropy of _target_ among
    the examples with that value is computed; these entropies are then
    averaged, weighted by the relative frequency of b. _examples_ is
    traversed several times, so it must be a re-iterable collection of rows.

    >>> conditional_entropy(mushrooms_examples, 'Color', 'Eatability')
    0.4
    """
    attribute_vals = set(row[attribute] for row in examples)
    target_vals = set(row[target] for row in examples)
    attribute_prior = priors(row[attribute] for row in examples)
    target_given_attribute = posteriors(
        [(row[target], row[attribute]) for row in examples])

    def entropy_given(b):
        # Entropy of the target distribution restricted to attribute == b.
        return -sum(target_given_attribute[(a, b)] * log2(target_given_attribute[(a, b)])
                    for a in target_vals)

    entropy_by_value = {b: entropy_given(b) for b in attribute_vals}
    return sum(attribute_prior[b] * entropy_by_value[b] for b in attribute_vals)
    
    
def information_gain(examples, attribute, target):
    """Information gain of _attribute_ with respect to _target_ over _examples_.

    Computed as H(target) - H(target | attribute): the prior entropy of the
    class distribution minus the conditional entropy that remains once the
    value of _attribute_ is known. _examples_ must be a re-iterable
    collection of rows indexable by _attribute_ and _target_.
    """
    ## prior entropy
    p = priors(x[target] for x in examples)
    H_class = -sum(p[c] * log2(p[c]) for c in p)

    ## conditional entropy -- conditional_entropy takes the examples plus the
    ## two column names, not pre-extracted value lists.
    H_class_given_att = conditional_entropy(examples, attribute, target)

    return H_class - H_class_given_att
    
# Run the doctest examples embedded in the docstrings above; testmod() with
# no arguments checks the module the notebook code is executing in.
doctest.testmod()

Out[5]:

TestResults(failed=0, attempted=3)

In [6]:

class ID3Node(object):
    """A node of an ID3 decision tree.

    A node stores the attribute it splits on (None or '' for a leaf), a class
    label used as the answer when no matching child exists, and a mapping
    from attribute values to child nodes.
    """

    def __init__(self):
        self.label = None
        self.attribute = None
        self.children = {}

    def create_edge(self, att_val, child):
        """Attach child as the subtree followed when the attribute equals att_val."""
        self.children[att_val] = child

    def classify(self, example):
        """Return the predicted label for example (a mapping attribute -> value).

        Descends along the edge matching the example's value of this node's
        attribute; a leaf, or a node without an edge for that value, answers
        with its own label.
        """
        if self.children:
            v = example[self.attribute]
            if v in self.children:
                return self.children[v].classify(example)
        return self.label

    def size(self):
        """Return the number of nodes in the subtree rooted at this node."""
        return 1 + sum(child.size() for child in self.children.values())

In [7]:

def values(examples, attribute):
    """Collect the distinct values that attribute takes across examples."""
    distinct = set()
    for example in examples:
        distinct.add(example[attribute])
    return distinct

def id3(examples, attributes, target):
    """Build an ID3 decision tree for _target_ from _examples_.

    Parameters:
      examples   -- non-empty list of rows (mappings attribute -> value).
      attributes -- iterable of attribute names that may be used for splits,
                    or None to use every key of the first example except
                    _target_.
      target     -- key holding the class label.

    Returns the root ID3Node. Every node, inner or leaf, carries the most
    common class of the examples that reached it as its label.
    """
    if attributes is None:
        attributes = set(examples[0].keys()).difference([target])
    else:
        attributes = set(attributes)
    t = ID3Node()
    mcc = most_common_class(examples, target)
    t.label = mcc
    # Stop when the node is pure or there is nothing left to split on.
    if all(x[target] == mcc for x in examples) or len(attributes) == 0:
        return t
    # Lowest conditional entropy is equivalent to highest information gain.
    a_star = min(attributes, key=lambda a: conditional_entropy(examples, a, target))
    t.attribute = a_star
    # Remove the attribute *name* from the candidates. Passing the bare
    # string to set.difference() would iterate over its characters and
    # remove nothing, so the same attribute could be chosen again forever.
    remaining = attributes - {a_star}
    # Only values observed in `examples` are iterated, so every partition is
    # non-empty; values unseen here are handled by ID3Node.classify, which
    # falls back to the node's own label.
    for a in values(examples, a_star):
        D_a = [x for x in examples if x[a_star] == a]
        t.create_edge(a, id3(D_a, remaining, target))
    return t

In [8]:

def print_tree(t, prefix=""):
    """Print the tree rooted at t as indented text.

    A node with a truthy attribute is printed as "[attribute]"; any other
    node is printed as "label:<label>". Each child follows on its own line
    as "  - (value) -> " plus the child's rendering, and deeper levels are
    indented by the width of the edge text above them.

    Parameters:
      t      -- node exposing .attribute, .label and .children (dict).
      prefix -- indentation printed before each edge of t; used by the
                recursion, callers normally leave the default.
    """
    if t.attribute:
        print('[{}]'.format(t.attribute))
    else:
        print('label:{}'.format(t.label))
    for val, c in t.children.items():
        valstr = "  - ({}) -> ".format(val)
        # No newline: the child's own first line continues on this line.
        print(prefix + valstr, end="")
        print_tree(c, prefix=prefix + " " * len(valstr))

In [11]:

# Learn a tree for 'Eatability' from the mushroom examples, allowing splits
# on the three descriptive attributes, then print it.
t = id3(mushrooms_examples, {'Color', 'Size', 'Points'}, 'Eatability')
print_tree(t)

[Points]
  - (no) -> label:eatable
  - (yes) -> [Size]
               - (small) -> label:toxic
               - (large) -> label:eatable

In [12]:

def misclass_rate(tree, examples, target):
    """Fraction of examples whose tree prediction differs from example[target].

    tree must provide classify(example). Raises ZeroDivisionError when
    examples is empty.
    """
    outcomes = [bool(tree.classify(example) == example[target])
                for example in examples]
    mistakes = outcomes.count(False)
    return mistakes / len(outcomes)

import random

def cross_validate_id3(examples, target, k, seed=0):
    """Estimate the misclassification rate of id3 by k-fold cross-validation.

    Parameters:
      examples -- collection of rows (mappings attribute -> value); it is
                  copied, the caller's collection is not reordered.
      target   -- key holding the class label.
      k        -- number of folds, 2 <= k <= len(examples).
      seed     -- seed for the shuffle that assigns examples to folds.

    Prints one summary line per fold and returns the mean misclassification
    rate over the k folds.

    Raises ValueError if k is outside 2..len(examples): a single fold leaves
    nothing to train on, and more folds than examples leaves empty test folds.
    """
    xs = list(examples)  ## create copy to not modify original list
    if not 2 <= k <= len(xs):
        raise ValueError(
            'k must be between 2 and the number of examples ({}), got {}'.format(
                len(xs), k))

    # Taken before shuffling, i.e. from the first example as supplied.
    attributes = set(xs[0].keys()).difference([target])

    # A private generator gives the same permutation as seeding the global
    # one, without resetting the random state of the rest of the notebook.
    rng = random.Random(seed)
    rng.shuffle(xs)

    foldsize = len(xs) // k  ## integer division
    folds = [xs[i * foldsize:(i + 1) * foldsize] for i in range(k)]
    for i, x in enumerate(xs[k * foldsize:]):  ## len not evenly divisible by k
        folds[i % k].append(x)

    misclass_sum = 0

    for i in range(k):
        train = [x for j in range(k) if j != i for x in folds[j]]
        test = folds[i]
        tree = id3(train, attributes, target)
        misclass = misclass_rate(tree, test, target)
        print('Fold {}: n_train={}, n_test={}, misclass_rate={:.2f}'.format(
            i, len(train), len(test), misclass))
        misclass_sum += misclass

    return misclass_sum / k

In [13]:

# Error of the mushroom tree on the very examples it was trained on.
misclass_rate(t, mushrooms_examples, 'Eatability')

Out[13]:

0.0

In [14]:

# 6-fold cross-validation of id3 on the dogs data.
# NOTE(review): within this notebook dogs_examples is only assigned in a later
# cell (the one that builds dogs_df), so on a fresh top-to-bottom run this
# cell fails with NameError unless that cell is moved above it. The recorded
# fold sizes below add up to 7 examples, while the dogs table in that later
# cell has 8 rows, so the stored output looks stale -- re-run to confirm.
avg = cross_validate_id3(dogs_examples, 'Class', 6)
print("Average misclassification rate: {:.2f}".format(avg))

Fold 0: n_train=5, n_test=2, misclass_rate=0.50
Fold 1: n_train=6, n_test=1, misclass_rate=0.00
Fold 2: n_train=6, n_test=1, misclass_rate=1.00
Fold 3: n_train=6, n_test=1, misclass_rate=0.00
Fold 4: n_train=6, n_test=1, misclass_rate=1.00
Fold 5: n_train=6, n_test=1, misclass_rate=1.00
Average misclassification rate: 0.58

In [15]:

def pruning(tree, branch, examples, examples_sub, target):
    """Try to collapse _branch_ (a node inside _tree_) into a leaf, top-down.

    Parameters:
      tree         -- root node of the whole tree; only used for printing
                      and as the return value.
      branch       -- node currently considered for pruning.
      examples     -- examples used to score the branch before and after
                      collapsing it (both scores classify all of these
                      examples with _branch_ alone).
      examples_sub -- examples used to choose the label of the collapsed
                      leaf; narrowed to the matching attribute value on
                      every step down the tree.
      target       -- key holding the class label.

    The branch is collapsed when that strictly lowers its misclassification
    rate on _examples_; otherwise it is restored and its children are tried
    in turn. Progress is printed along the way. Nodes are modified in place
    and _tree_ is returned.
    """
    if not branch.attribute:  # a leaf, return
        print('a leaf, label is', branch.label)
        print_tree(tree)
        return tree

    # a sub-tree, pruning if better
    print('a sub tree, key is', branch.attribute)
    # Remember the node's fields so it can be restored *in place*. Restoring
    # through a copy would leave the collapsed node linked into the tree.
    saved_label = branch.label
    saved_attribute = branch.attribute
    saved_children = branch.children
    old_err = misclass_rate(branch, examples, target)

    # With no examples reaching this branch there is nothing to vote with,
    # so the collapsed leaf keeps the label the node already had.
    if examples_sub:
        branch.label = most_common_class(examples_sub, target)
    branch.attribute = ''
    branch.children = {}
    new_err = misclass_rate(branch, examples, target)
    print(new_err, old_err)

    if old_err <= new_err:  # old is better, keep the old tree and try pruning its children
        print('keep it!')
        branch.label = saved_label
        branch.attribute = saved_attribute
        branch.children = saved_children
        print_tree(tree)
        a_star = branch.attribute

        for val, tt in branch.children.items():
            ex = [x for x in examples_sub if x[a_star] == val]
            print('check', val)
            tree = pruning(tree, tt, examples, ex, target)
        return tree

    # new is better, pruning and return
    print('pruning it!')
    print_tree(tree)
    return tree

In [16]:

# Tab-separated toy data set: eight dogs with three categorical attributes
# (Color, Fur, Size) and the class column Class.
dogs_df = pd.read_csv(
    StringIO("""
Color\tFur\tSize\tClass
brown\tragged\tsmall\twell-behaved
black\tragged\tbig\tdangerous
black\tsmooth\tbig\tdangerous
black\tcurly\tsmall\twell-behaved
white\tcurly\tsmall\twell-behaved
white\tcurly\tbig\tdangerous
white\tsmooth\tsmall\tdangerous
red\tragged\tbig\twell-behaved"""),
    sep="\t",
)

# One dict per row, mapping column name -> cell value.
dogs_examples = dogs_df.to_dict(orient="records")

# Learn and print a tree for 'Class' on the dogs data. This rebinds `t`,
# which previously held the mushroom tree.
t = id3(dogs_examples, {'Color', 'Fur', 'Size'}, 'Class')
print_tree(t)

[Fur]
  - (curly) -> [Size]
                 - (small) -> label:well-behaved
                 - (big) -> label:dangerous
  - (ragged) -> [Color]
                  - (black) -> label:dangerous
                  - (red) -> label:well-behaved
                  - (brown) -> label:well-behaved
  - (smooth) -> label:dangerous

In [17]:

# Development set for pruning: the same eight attribute combinations as the
# dogs training table, with the class of two rows (black/ragged/big and
# white/curly/big) set to well-behaved.
dogs_df_dev = pd.read_csv(
    StringIO("""
Color\tFur\tSize\tClass
brown\tragged\tsmall\twell-behaved
black\tragged\tbig\twell-behaved
black\tsmooth\tbig\tdangerous
black\tcurly\tsmall\twell-behaved
white\tcurly\tsmall\twell-behaved
white\tcurly\tbig\twell-behaved
white\tsmooth\tsmall\tdangerous
red\tragged\tbig\twell-behaved"""),
    sep="\t",
)

# One dict per row, mapping column name -> cell value.
dogs_examples_dev = dogs_df_dev.to_dict(orient="records")
dogs_examples_dev

# Prune the dogs tree `t` starting from its root. The dev examples are passed
# twice: once to score each candidate pruning and once as the pool from which
# the replacement leaf labels are chosen.
pruning_branch = t
pruning_ex = dogs_examples_dev
new_t = pruning(t, pruning_branch, dogs_examples_dev, pruning_ex, 'Class')
print('final')
print_tree(new_t)

a sub tree, key is Fur
0.25 0.25
keep it!
[Fur]
  - (curly) -> [Size]
                 - (small) -> label:well-behaved
                 - (big) -> label:dangerous
  - (ragged) -> [Color]
                  - (black) -> label:dangerous
                  - (red) -> label:well-behaved
                  - (brown) -> label:well-behaved
  - (smooth) -> label:dangerous
check curly
a sub tree, key is Size
0.25 0.5
pruning it!
[Fur]
  - (curly) -> label:well-behaved
  - (ragged) -> [Color]
                  - (black) -> label:dangerous
                  - (red) -> label:well-behaved
                  - (brown) -> label:well-behaved
  - (smooth) -> label:dangerous
check ragged
a sub tree, key is Color
0.25 0.375
pruning it!
[Fur]
  - (curly) -> label:well-behaved
  - (ragged) -> label:well-behaved
  - (smooth) -> label:dangerous
check smooth
a leaf, label is dangerous
[Fur]
  - (curly) -> label:well-behaved
  - (ragged) -> label:well-behaved
  - (smooth) -> label:dangerous
final
[Fur]
  - (curly) -> label:well-behaved
  - (ragged) -> label:well-behaved
  - (smooth) -> label:dangerous

Cars dataset

In [18]:

from urllib.request import urlopen

# Column names for the car data file, which is read without a header row.
# 'MaintentancePrice' (sic) is left as spelled because the column name is
# used as the attribute key in the cells and tree output that follow.
cars_columns = ['BuyingPrice', 'MaintentancePrice', 'Doors', 'Persons', 'Luggage', 'Safety', 'Acceptable']

# Close the HTTP response as soon as pandas has read it instead of leaving
# the connection open until garbage collection.
# NOTE(review): plain-HTTP mirror URL -- confirm it is still reachable.
with urlopen("http://mlr.cs.umass.edu/ml/machine-learning-databases/car/car.data") as response:
    cars = pd.read_csv(response, names=cars_columns)

In [19]:

# One dict per car, mapping column name -> cell value.
cars_examples = cars.to_dict(orient="records")

In [20]:

# Learn a tree for 'Acceptable', allowing splits on every other column.
tree = id3(cars_examples, set(cars_columns).difference(['Acceptable']), 'Acceptable')

In [21]:

# Print the full, unpruned cars tree (the output is several hundred lines long).
print_tree(tree)

[Safety]
  - (high) -> [Persons]
                - (4) -> [BuyingPrice]
                           - (high) -> [MaintentancePrice]
                                         - (high) -> label:acc
                                         - (low) -> label:acc
                                         - (vhigh) -> label:unacc
                                         - (med) -> label:acc
                           - (low) -> [MaintentancePrice]
                                        - (high) -> [Luggage]
                                                      - (small) -> label:acc
                                                      - (big) -> label:vgood
                                                      - (med) -> [Doors]
                                                                   - (5more) -> label:vgood
                                                                   - (4) -> label:vgood
                                                                   - (2) -> label:acc
                                                                   - (3) -> label:acc
                                        - (low) -> [Luggage]
                                                     - (small) -> label:good
                                                     - (big) -> label:vgood
                                                     - (med) -> [Doors]
                                                                  - (5more) -> label:vgood
                                                                  - (4) -> label:vgood
                                                                  - (2) -> label:good
                                                                  - (3) -> label:good
                                        - (vhigh) -> label:acc
                                        - (med) -> [Luggage]
                                                     - (small) -> label:good
                                                     - (big) -> label:vgood
                                                     - (med) -> [Doors]
                                                                  - (5more) -> label:vgood
                                                                  - (4) -> label:vgood
                                                                  - (2) -> label:good
                                                                  - (3) -> label:good
                           - (vhigh) -> [MaintentancePrice]
                                          - (high) -> label:unacc
                                          - (low) -> label:acc
                                          - (vhigh) -> label:unacc
                                          - (med) -> label:acc
                           - (med) -> [MaintentancePrice]
                                        - (high) -> label:acc
                                        - (low) -> [Luggage]
                                                     - (small) -> label:good
                                                     - (big) -> label:vgood
                                                     - (med) -> [Doors]
                                                                  - (5more) -> label:vgood
                                                                  - (4) -> label:vgood
                                                                  - (2) -> label:good
                                                                  - (3) -> label:good
                                        - (vhigh) -> label:acc
                                        - (med) -> [Luggage]
                                                     - (small) -> label:acc
                                                     - (big) -> label:vgood
                                                     - (med) -> [Doors]
                                                                  - (5more) -> label:vgood
                                                                  - (4) -> label:vgood
                                                                  - (2) -> label:acc
                                                                  - (3) -> label:acc
                - (2) -> label:unacc
                - (more) -> [BuyingPrice]
                              - (high) -> [MaintentancePrice]
                                            - (high) -> [Doors]
                                                          - (5more) -> label:acc
                                                          - (4) -> label:acc
                                                          - (2) -> [Luggage]
                                                                     - (small) -> label:unacc
                                                                     - (big) -> label:acc
                                                                     - (med) -> label:acc
                                                          - (3) -> label:acc
                                            - (low) -> [Doors]
                                                         - (5more) -> label:acc
                                                         - (4) -> label:acc
                                                         - (2) -> [Luggage]
                                                                    - (small) -> label:unacc
                                                                    - (big) -> label:acc
                                                                    - (med) -> label:acc
                                                         - (3) -> label:acc
                                            - (vhigh) -> label:unacc
                                            - (med) -> [Doors]
                                                         - (5more) -> label:acc
                                                         - (4) -> label:acc
                                                         - (2) -> [Luggage]
                                                                    - (small) -> label:unacc
                                                                    - (big) -> label:acc
                                                                    - (med) -> label:acc
                                                         - (3) -> label:acc
                              - (low) -> [MaintentancePrice]
                                           - (high) -> [Luggage]
                                                         - (small) -> [Doors]
                                                                        - (5more) -> label:acc
                                                                        - (4) -> label:acc
                                                                        - (2) -> label:unacc
                                                                        - (3) -> label:acc
                                                         - (big) -> label:vgood
                                                         - (med) -> [Doors]
                                                                      - (5more) -> label:vgood
                                                                      - (4) -> label:vgood
                                                                      - (2) -> label:acc
                                                                      - (3) -> label:vgood
                                           - (low) -> [Luggage]
                                                        - (small) -> [Doors]
                                                                       - (5more) -> label:good
                                                                       - (4) -> label:good
                                                                       - (2) -> label:unacc
                                                                       - (3) -> label:good
                                                        - (big) -> label:vgood
                                                        - (med) -> [Doors]
                                                                     - (5more) -> label:vgood
                                                                     - (4) -> label:vgood
                                                                     - (2) -> label:good
                                                                     - (3) -> label:vgood
                                           - (vhigh) -> [Doors]
                                                          - (5more) -> label:acc
                                                          - (4) -> label:acc
                                                          - (2) -> [Luggage]
                                                                     - (small) -> label:unacc
                                                                     - (big) -> label:acc
                                                                     - (med) -> label:acc
                                                          - (3) -> label:acc
                                           - (med) -> [Luggage]
                                                        - (small) -> [Doors]
                                                                       - (5more) -> label:good
                                                                       - (4) -> label:good
                                                                       - (2) -> label:unacc
                                                                       - (3) -> label:good
                                                        - (big) -> label:vgood
                                                        - (med) -> [Doors]
                                                                     - (5more) -> label:vgood
                                                                     - (4) -> label:vgood
                                                                     - (2) -> label:good
                                                                     - (3) -> label:vgood
                              - (vhigh) -> [MaintentancePrice]
                                             - (high) -> label:unacc
                                             - (low) -> [Doors]
                                                          - (5more) -> label:acc
                                                          - (4) -> label:acc
                                                          - (2) -> [Luggage]
                                                                     - (small) -> label:unacc
                                                                     - (big) -> label:acc
                                                                     - (med) -> label:acc
                                                          - (3) -> label:acc
                                             - (vhigh) -> label:unacc
                                             - (med) -> [Doors]
                                                          - (5more) -> label:acc
                                                          - (4) -> label:acc
                                                          - (2) -> [Luggage]
                                                                     - (small) -> label:unacc
                                                                     - (big) -> label:acc
                                                                     - (med) -> label:acc
                                                          - (3) -> label:acc
                              - (med) -> [MaintentancePrice]
                                           - (high) -> [Doors]
                                                         - (5more) -> label:acc
                                                         - (4) -> label:acc
                                                         - (2) -> [Luggage]
                                                                    - (small) -> label:unacc
                                                                    - (big) -> label:acc
                                                                    - (med) -> label:acc
                                                         - (3) -> label:acc
                                           - (low) -> [Luggage]
                                                        - (small) -> [Doors]
                                                                       - (5more) -> label:good
                                                                       - (4) -> label:good
                                                                       - (2) -> label:unacc
                                                                       - (3) -> label:good
                                                        - (big) -> label:vgood
                                                        - (med) -> [Doors]
                                                                     - (5more) -> label:vgood
                                                                     - (4) -> label:vgood
                                                                     - (2) -> label:good
                                                                     - (3) -> label:vgood
                                           - (vhigh) -> [Doors]
                                                          - (5more) -> label:acc
                                                          - (4) -> label:acc
                                                          - (2) -> [Luggage]
                                                                     - (small) -> label:unacc
                                                                     - (big) -> label:acc
                                                                     - (med) -> label:acc
                                                          - (3) -> label:acc
                                           - (med) -> [Luggage]
                                                        - (small) -> [Doors]
                                                                       - (5more) -> label:acc
                                                                       - (4) -> label:acc
                                                                       - (2) -> label:unacc
                                                                       - (3) -> label:acc
                                                        - (big) -> label:vgood
                                                        - (med) -> [Doors]
                                                                     - (5more) -> label:vgood
                                                                     - (4) -> label:vgood
                                                                     - (2) -> label:acc
                                                                     - (3) -> label:vgood
  - (low) -> label:unacc
  - (med) -> [Persons]
               - (4) -> [BuyingPrice]
                          - (high) -> [Luggage]
                                        - (small) -> label:unacc
                                        - (big) -> [MaintentancePrice]
                                                     - (high) -> label:acc
                                                     - (low) -> label:acc
                                                     - (vhigh) -> label:unacc
                                                     - (med) -> label:acc
                                        - (med) -> [Doors]
                                                     - (5more) -> [MaintentancePrice]
                                                                    - (high) -> label:acc
                                                                    - (low) -> label:acc
                                                                    - (vhigh) -> label:unacc
                                                                    - (med) -> label:acc
                                                     - (4) -> [MaintentancePrice]
                                                                - (high) -> label:acc
                                                                - (low) -> label:acc
                                                                - (vhigh) -> label:unacc
                                                                - (med) -> label:acc
                                                     - (2) -> label:unacc
                                                     - (3) -> label:unacc
                          - (low) -> [MaintentancePrice]
                                       - (high) -> label:acc
                                       - (low) -> [Luggage]
                                                    - (small) -> label:acc
                                                    - (big) -> label:good
                                                    - (med) -> [Doors]
                                                                 - (5more) -> label:good
                                                                 - (4) -> label:good
                                                                 - (2) -> label:acc
                                                                 - (3) -> label:acc
                                       - (vhigh) -> [Luggage]
                                                      - (small) -> label:unacc
                                                      - (big) -> label:acc
                                                      - (med) -> [Doors]
                                                                   - (5more) -> label:acc
                                                                   - (4) -> label:acc
                                                                   - (2) -> label:unacc
                                                                   - (3) -> label:unacc
                                       - (med) -> [Luggage]
                                                    - (small) -> label:acc
                                                    - (big) -> label:good
                                                    - (med) -> [Doors]
                                                                 - (5more) -> label:good
                                                                 - (4) -> label:good
                                                                 - (2) -> label:acc
                                                                 - (3) -> label:acc
                          - (vhigh) -> [MaintentancePrice]
                                         - (high) -> label:unacc
                                         - (low) -> [Luggage]
                                                      - (small) -> label:unacc
                                                      - (big) -> label:acc
                                                      - (med) -> [Doors]
                                                                   - (5more) -> label:acc
                                                                   - (4) -> label:acc
                                                                   - (2) -> label:unacc
                                                                   - (3) -> label:unacc
                                         - (vhigh) -> label:unacc
                                         - (med) -> [Luggage]
                                                      - (small) -> label:unacc
                                                      - (big) -> label:acc
                                                      - (med) -> [Doors]
                                                                   - (5more) -> label:acc
                                                                   - (4) -> label:acc
                                                                   - (2) -> label:unacc
                                                                   - (3) -> label:unacc
                          - (med) -> [MaintentancePrice]
                                       - (high) -> [Luggage]
                                                     - (small) -> label:unacc
                                                     - (big) -> label:acc
                                                     - (med) -> [Doors]
                                                                  - (5more) -> label:acc
                                                                  - (4) -> label:acc
                                                                  - (2) -> label:unacc
                                                                  - (3) -> label:unacc
                                       - (low) -> [Luggage]
                                                    - (small) -> label:acc
                                                    - (big) -> label:good
                                                    - (med) -> [Doors]
                                                                 - (5more) -> label:good
                                                                 - (4) -> label:good
                                                                 - (2) -> label:acc
                                                                 - (3) -> label:acc
                                       - (vhigh) -> [Luggage]
                                                      - (small) -> label:unacc
                                                      - (big) -> label:acc
                                                      - (med) -> [Doors]
                                                                   - (5more) -> label:acc
                                                                   - (4) -> label:acc
                                                                   - (2) -> label:unacc
                                                                   - (3) -> label:unacc
                                       - (med) -> label:acc
               - (2) -> label:unacc
               - (more) -> [BuyingPrice]
                             - (high) -> [Luggage]
                                           - (small) -> label:unacc
                                           - (big) -> [MaintentancePrice]
                                                        - (high) -> label:acc
                                                        - (low) -> label:acc
                                                        - (vhigh) -> label:unacc
                                                        - (med) -> label:acc
                                           - (med) -> [MaintentancePrice]
                                                        - (high) -> [Doors]
                                                                      - (5more) -> label:acc
                                                                      - (4) -> label:acc
                                                                      - (2) -> label:unacc
                                                                      - (3) -> label:acc
                                                        - (low) -> [Doors]
                                                                     - (5more) -> label:acc
                                                                     - (4) -> label:acc
                                                                     - (2) -> label:unacc
                                                                     - (3) -> label:acc
                                                        - (vhigh) -> label:unacc
                                                        - (med) -> [Doors]
                                                                     - (5more) -> label:acc
                                                                     - (4) -> label:acc
                                                                     - (2) -> label:unacc
                                                                     - (3) -> label:acc
                             - (low) -> [MaintentancePrice]
                                          - (high) -> [Doors]
                                                        - (5more) -> label:acc
                                                        - (4) -> label:acc
                                                        - (2) -> [Luggage]
                                                                   - (small) -> label:unacc
                                                                   - (big) -> label:acc
                                                                   - (med) -> label:acc
                                                        - (3) -> label:acc
                                          - (low) -> [Luggage]
                                                       - (small) -> [Doors]
                                                                      - (5more) -> label:acc
                                                                      - (4) -> label:acc
                                                                      - (2) -> label:unacc
                                                                      - (3) -> label:acc
                                                       - (big) -> label:good
                                                       - (med) -> [Doors]
                                                                    - (5more) -> label:good
                                                                    - (4) -> label:good
                                                                    - (2) -> label:acc
                                                                    - (3) -> label:good
                                          - (vhigh) -> [Luggage]
                                                         - (small) -> label:unacc
                                                         - (big) -> label:acc
                                                         - (med) -> [Doors]
                                                                      - (5more) -> label:acc
                                                                      - (4) -> label:acc
                                                                      - (2) -> label:unacc
                                                                      - (3) -> label:acc
                                          - (med) -> [Luggage]
                                                       - (small) -> [Doors]
                                                                      - (5more) -> label:acc
                                                                      - (4) -> label:acc
                                                                      - (2) -> label:unacc
                                                                      - (3) -> label:acc
                                                       - (big) -> label:good
                                                       - (med) -> [Doors]
                                                                    - (5more) -> label:good
                                                                    - (4) -> label:good
                                                                    - (2) -> label:acc
                                                                    - (3) -> label:good
                             - (vhigh) -> [MaintentancePrice]
                                            - (high) -> label:unacc
                                            - (low) -> [Luggage]
                                                         - (small) -> label:unacc
                                                         - (big) -> label:acc
                                                         - (med) -> [Doors]
                                                                      - (5more) -> label:acc
                                                                      - (4) -> label:acc
                                                                      - (2) -> label:unacc
                                                                      - (3) -> label:acc
                                            - (vhigh) -> label:unacc
                                            - (med) -> [Luggage]
                                                         - (small) -> label:unacc
                                                         - (big) -> label:acc
                                                         - (med) -> [Doors]
                                                                      - (5more) -> label:acc
                                                                      - (4) -> label:acc
                                                                      - (2) -> label:unacc
                                                                      - (3) -> label:acc
                             - (med) -> [MaintentancePrice]
                                          - (high) -> [Luggage]
                                                        - (small) -> label:unacc
                                                        - (big) -> label:acc
                                                        - (med) -> [Doors]
                                                                     - (5more) -> label:acc
                                                                     - (4) -> label:acc
                                                                     - (2) -> label:unacc
                                                                     - (3) -> label:acc
                                          - (low) -> [Luggage]
                                                       - (small) -> [Doors]
                                                                      - (5more) -> label:acc
                                                                      - (4) -> label:acc
                                                                      - (2) -> label:unacc
                                                                      - (3) -> label:acc
                                                       - (big) -> label:good
                                                       - (med) -> [Doors]
                                                                    - (5more) -> label:good
                                                                    - (4) -> label:good
                                                                    - (2) -> label:acc
                                                                    - (3) -> label:good
                                          - (vhigh) -> [Luggage]
                                                         - (small) -> label:unacc
                                                         - (big) -> label:acc
                                                         - (med) -> [Doors]
                                                                      - (5more) -> label:acc
                                                                      - (4) -> label:acc
                                                                      - (2) -> label:unacc
                                                                      - (3) -> label:acc
                                          - (med) -> [Doors]
                                                       - (5more) -> label:acc
                                                       - (4) -> label:acc
                                                       - (2) -> [Luggage]
                                                                  - (small) -> label:unacc
                                                                  - (big) -> label:acc
                                                                  - (med) -> label:acc
                                                       - (3) -> label:acc

In [22]:

# Displays a 3-tuple: misclass_rate of `tree` on cars_examples for the target
# 'Acceptable', the tree's size(), and the number of examples.
# NOTE(review): presumably `tree` was built from these same examples, which
# would make the first value a training error -- confirm against the cell that
# creates `tree`.
misclass_rate(tree, cars_examples, 'Acceptable'), tree.size(), len(cars_examples)

Out[22]:

(0.0, 408, 1728)

In [23]:

# Cross-validate ID3 on the cars data with 'Acceptable' as the target.
# NOTE(review): the last argument (3) is presumably the number of folds --
# confirm against the definition of cross_validate_id3.
cross_validate_id3(cars_examples, 'Acceptable', 3)

Fold 0: n_train=1152, n_test=576, misclass_rate=0.09
Fold 1: n_train=1152, n_test=576, misclass_rate=0.06
Fold 2: n_train=1152, n_test=576, misclass_rate=0.07

Out[23]:

0.07349537037037036

In [24]:

%matplotlib inline
import matplotlib.pyplot as plt


train = cars_examples.copy()
random.shuffle(train)

test = train[:500]
train = train[500:]


x = np.arange(10, 1200, 20)
ts = []
test_error = []
train_error = []

for i in x:
    t = id3(train[:i], None, 'Acceptable')
    ts.append(t.size())
    test_error.append(misclass_rate(t, test, 'Acceptable'))
    train_error.append(misclass_rate(t, train, 'Acceptable'))


plt.ylabel('Error rate')
plt.xlabel('Training set size')
plt.plot(x, train_error, label='train error')
plt.plot(x, test_error, label='test error')
plt.grid(True)
plt.legend();

In [ ]:

Big Mushrooms Dataset¶

In [25]:

# Column names for the mushroom data file: the class label first, followed by
# the 22 descriptive attributes.
mushrooms_columns = [
    'Edible?',
    'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
    'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
    'stalk-shape', 'stalk-root', 'stalk-surface-above-ring', 'stalk-surface-below-ring',
    'stalk-color-above-ring', 'stalk-color-below-ring',
    'veil-type', 'veil-color',
    'ring-number', 'ring-type',
    'spore-print-color',
    'population', 'habitat',
]

# Fetch from the UCI archive over HTTPS (the same host the voting and chess
# cells use) instead of a plain-HTTP mirror, and close the HTTP response once
# pandas has consumed it. Column names are supplied explicitly, so the first
# line of the file is read as data (header=None).
with urlopen(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"
) as response:
    mushrooms = pd.read_csv(response, names=mushrooms_columns, header=None)

In [26]:

# Sanity check on the column list: it names 23 columns (the 'Edible?' label
# plus 22 attributes).
len(mushrooms_columns)

Out[26]:

In [27]:

# Convert the frame into a list with one dict per row, keyed by column name.
# The former `mushrooms_columns[:100]` selection was a no-op (the list has
# only 23 names, all of which are columns of the frame), so the conversion now
# matches the one used for the voting and chess data.
mushrooms_examples = list(mushrooms.T.to_dict().values())

In [28]:

# Build a tree for the target 'Edible?' and display
# (misclass_rate on the same examples, t.size(), number of examples).
# NOTE(review): None is passed where the voting and chess cells pass an
# explicit attribute set; presumably it means "use every attribute except the
# target" -- confirm against the definition of id3.
t = id3(mushrooms_examples, None, 'Edible?')
misclass_rate(t, mushrooms_examples, 'Edible?'), t.size(), len(mushrooms_examples)

Out[28]:

(0.0, 27, 8124)

In [29]:

# Cross-validate ID3 on the mushroom data with 'Edible?' as the target.
# NOTE(review): the last argument (5) is presumably the number of folds --
# confirm against the definition of cross_validate_id3.
cross_validate_id3(mushrooms_examples, 'Edible?', 5)

Fold 0: n_train=6499, n_test=1625, misclass_rate=0.00
Fold 1: n_train=6499, n_test=1625, misclass_rate=0.00
Fold 2: n_train=6499, n_test=1625, misclass_rate=0.00
Fold 3: n_train=6499, n_test=1625, misclass_rate=0.00
Fold 4: n_train=6500, n_test=1624, misclass_rate=0.00

Out[29]:

0.0

Voting Data¶

In [30]:

# Column names: the party label followed by 16 vote columns (vote0 .. vote15).
# (The spelling `voting_colums` is kept because later cells refer to it.)
voting_colums = ['party'] + ['vote%d' % i for i in range(16)]

# Download the congressional voting records and parse them with the names
# above; the `with` block closes the HTTP response once pandas has read it.
with urlopen(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/voting-records/house-votes-84.data"
) as response:
    voting = pd.read_csv(response, names=voting_colums, header=None)

# One dict per row, keyed by column name.
voting_examples = list(voting.T.to_dict().values())

In [31]:

# Build a tree that predicts 'party' from every column except 'party' itself.
t = id3(voting_examples, set(voting_colums) - {'party'}, 'party')

In [32]:

# Displays (misclass_rate of `t` on voting_examples for the target 'party',
# t.size(), number of examples).
misclass_rate(t, voting_examples, 'party'), t.size(), len(voting_examples)

Out[32]:

(0.0, 60, 435)

In [33]:

# Cross-validate ID3 on the voting data with 'party' as the target.
# NOTE(review): the last argument (5) is presumably the number of folds --
# confirm against the definition of cross_validate_id3.
cross_validate_id3(voting_examples, 'party', 5)

Fold 0: n_train=348, n_test=87, misclass_rate=0.05
Fold 1: n_train=348, n_test=87, misclass_rate=0.07
Fold 2: n_train=348, n_test=87, misclass_rate=0.05
Fold 3: n_train=348, n_test=87, misclass_rate=0.05
Fold 4: n_train=348, n_test=87, misclass_rate=0.10

Out[33]:

0.06206896551724138

Chess¶

In [34]:

# Column names: 36 generically named attribute columns (x0 .. x35) followed by
# the 'outcome' label.
chess_columns = ['x%d' % x for x in range(36)] + ['outcome']

# Download the king-rook vs. king-pawn data and parse it with the names above;
# the `with` block closes the HTTP response once pandas has read it.
with urlopen(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/chess/king-rook-vs-king-pawn/kr-vs-kp.data"
) as response:
    chess = pd.read_csv(response, names=chess_columns, header=None)

# One dict per row, keyed by column name.
chess_examples = list(chess.T.to_dict().values())

In [35]:

# Build a tree that predicts 'outcome' from every column except 'outcome' itself.
t = id3(chess_examples, set(chess_columns) - {'outcome'}, 'outcome')

In [36]:

# Displays (misclass_rate of `t` on chess_examples for the target 'outcome',
# t.size(), number of examples).
misclass_rate(t, chess_examples, 'outcome'), t.size(), len(chess_examples)

Out[36]:

(0.0, 93, 3196)

In [37]:

# Cross-validate ID3 on the chess data with 'outcome' as the target.
# NOTE(review): the last argument (5) is presumably the number of folds --
# confirm against the definition of cross_validate_id3.
cross_validate_id3(chess_examples, 'outcome', 5)

Fold 0: n_train=2556, n_test=640, misclass_rate=0.01
Fold 1: n_train=2557, n_test=639, misclass_rate=0.01
Fold 2: n_train=2557, n_test=639, misclass_rate=0.00
Fold 3: n_train=2557, n_test=639, misclass_rate=0.00
Fold 4: n_train=2557, n_test=639, misclass_rate=0.00

Out[37]:

0.005317879499217528

In [ ]: