# Toy mushroom data set parsed from an inline CSV string.
# NOTE(review): `pd` and `StringIO` are not imported in the visible part of
# the file -- presumably pandas and io.StringIO imported earlier; confirm.
mushrooms_df = pd.read_csv(StringIO("""
Color,Size,Points,Eatability
red,small,yes,toxic
brown,small,no,eatable
brown,large,yes,eatable
green,small,no,eatable
red,large,no,eatable
"""))
# Turn the frame into a list with one {column: value} dict per row.
mushrooms_examples = list(mushrooms_df.T.to_dict().values())
# Bare expression: displayed as the cell result in the notebook.
mushrooms_examples
[{'Color': 'red', 'Eatability': 'toxic', 'Points': 'yes', 'Size': 'small'}, {'Color': 'brown', 'Eatability': 'eatable', 'Points': 'no', 'Size': 'small'}, {'Color': 'brown', 'Eatability': 'eatable', 'Points': 'yes', 'Size': 'large'}, {'Color': 'green', 'Eatability': 'eatable', 'Points': 'no', 'Size': 'small'}, {'Color': 'red', 'Eatability': 'eatable', 'Points': 'no', 'Size': 'large'}]
from collections import Counter
import math
def log2(x):
    """Return the base-2 logarithm of x, with the convention log2(0) == 0.

    The zero convention lets entropy terms of the form p * log2(p) be
    evaluated for p == 0 without raising.
    """
    return math.log2(x) if x != 0 else 0
def most_common_class(examples, target_attribute):
    """Pick the value of target_attribute that occurs most often in examples.

    Each example is a mapping that contains target_attribute.  An empty
    sequence of examples raises IndexError.

    >>> most_common_class([{'c':1}, {'c': 2}, {'c': 1}], 'c')
    1
    """
    tally = Counter()
    for example in examples:
        tally[example[target_attribute]] += 1
    ranked = tally.most_common(1)
    winner, _ = ranked[0]
    return winner
def priors(items):
    """Return the relative frequency of every distinct value in items.

    items may be any iterable of hashable values; it is consumed once.
    The result maps each distinct value to count / total.  An empty
    iterable yields an empty dict.
    """
    counts = Counter(items)
    # Compute the denominator once instead of once per distinct value.
    total = sum(counts.values())
    return {value: count / total for value, count in counts.items()}
def posteriors(items):
    """Estimate P(A=a|B=b) from a list of (a, b) observations.

    The returned dict is keyed by (a, b) and holds an entry for every
    combination of an observed a with an observed b; combinations that
    never occur together get probability 0.0.  items is traversed more
    than once, so it must be a re-iterable sequence of 2-tuples.

    >>> list(sorted(posteriors([(1,6), (2,6), (1,6), (3,6), (1,5), (2,5)]).items()))
    [((1, 5), 0.5), ((1, 6), 0.5), ((2, 5), 0.5), ((2, 6), 0.25), ((3, 5), 0.0), ((3, 6), 0.25)]
    """
    conditions = {pair[1] for pair in items}
    outcomes = {pair[0] for pair in items}

    # Tally the outcomes separately for each conditioning value.
    tallies = {}
    for outcome, condition in items:
        tallies.setdefault(condition, Counter())[outcome] += 1

    table = {}
    for condition in conditions:
        seen = tallies[condition]
        total = sum(seen.values())
        for outcome in outcomes:
            table[(outcome, condition)] = seen.get(outcome, 0) / total
    return table
def conditional_entropy(examples, attribute, target):
    """Return the entropy of target that remains once attribute is known.

    For every value b of attribute, the entropy of target among the
    examples with attribute == b is computed; the result is the average
    of those entropies weighted by the relative frequency of b.  examples
    is traversed several times, so it must be a re-iterable sequence of
    mappings.

    >>> conditional_entropy(mushrooms_examples, 'Color', 'Eatability')
    0.4
    """
    attribute_prior = priors(x[attribute] for x in examples)
    target_given_attribute = posteriors(
        [(x[target], x[attribute]) for x in examples])
    target_vals = set(x[target] for x in examples)
    attribute_vals = set(x[attribute] for x in examples)

    def branch_entropy(b):
        # Entropy of the target restricted to examples with attribute == b.
        return -sum(
            target_given_attribute[(a, b)] * log2(target_given_attribute[(a, b)])
            for a in target_vals)

    return sum(attribute_prior[b] * branch_entropy(b) for b in attribute_vals)
def information_gain(examples, attribute, target):
    """Return the information gain of splitting examples on attribute.

    The gain is the entropy of target over all examples minus the
    conditional entropy of target given attribute.

    examples is a sequence of mappings, attribute the key to split on and
    target the key holding the class.
    """
    cls = [x[target] for x in examples]
    ## prior entropy
    p = priors(cls)
    H_class = -sum(p[c] * log2(p[c]) for c in set(cls))
    ## conditional entropy
    # conditional_entropy takes (examples, attribute, target); the earlier
    # two-argument call with pre-extracted columns raised TypeError.
    H_class_given_att = conditional_entropy(examples, attribute, target)
    return H_class - H_class_given_att
# Run the doctests embedded in the docstrings defined so far.
# NOTE(review): `doctest` is not imported in the visible part of the file.
doctest.testmod()
TestResults(failed=0, attempted=3)
class ID3Node:
    """One node of a decision tree built by id3.

    A node without children acts as a leaf and predicts its label.  An
    inner node tests attribute and follows the edge whose key equals the
    example's value for that attribute.
    """

    def __init__(self):
        # attribute value -> child node
        self.children = {}
        # name of the attribute tested at this node (None until assigned)
        self.attribute = None
        # class predicted when classification stops at this node
        self.label = None

    def create_edge(self, att_val, child):
        """Attach child under the edge for attribute value att_val."""
        self.children[att_val] = child

    def classify(self, example):
        """Return the predicted class for example (a mapping).

        Falls back to this node's own label when the node has no children
        or when the example's attribute value has no matching edge.
        """
        if self.children:
            child = self.children.get(example[self.attribute])
            if child is not None:
                return child.classify(example)
        return self.label

    def size(self):
        """Return the number of nodes in the subtree rooted here."""
        total = 1
        for child in self.children.values():
            total += child.size()
        return total
def values(examples, attribute):
    """Return the set of distinct values that attribute takes in examples."""
    return {example[attribute] for example in examples}
def id3(examples, attributes, target):
    """Build a decision tree for target from examples and return its root.

    examples   -- non-empty sequence of mappings (attribute name -> value)
    attributes -- collection of attribute names that may be split on; None
                  means every key of the first example except target
    target     -- name of the attribute holding the class

    Every node is labelled with the most common class of the examples that
    reach it.  Recursion stops when all examples share that class or when
    no attributes are left; otherwise the node splits on the attribute
    with the smallest conditional entropy.
    """
    if attributes is None:
        attributes = set(examples[0].keys()).difference([target])
    elif not isinstance(attributes, (set, frozenset)):
        attributes = set(attributes)

    node = ID3Node()
    mcc = most_common_class(examples, target)
    node.label = mcc
    if not attributes or all(x[target] == mcc for x in examples):
        return node

    a_star = min(attributes, key=lambda a: conditional_entropy(examples, a, target))
    node.attribute = a_star
    # Wrap the name in a list: difference() takes an iterable of elements,
    # so passing the bare string would remove its individual characters
    # and leave the attribute itself in the set.
    remaining = attributes.difference([a_star])
    # Every value comes from the examples, so each subset is non-empty.
    for value in values(examples, a_star):
        subset = [x for x in examples if x[a_star] == value]
        node.create_edge(value, id3(subset, remaining, target))
    return node
def print_tree(t, prefix=""):
    """Print the tree rooted at t to standard output.

    A node with a truthy attribute is shown as "[attribute]", any other
    node as "label:<label>".  Each child follows on its own line behind an
    " - (value) -> " edge marker; prefix is the indentation that aligns
    nested levels under their parent's edge marker.
    """
    if t.attribute:
        header = '[{}]'.format(t.attribute)
    else:
        header = 'label:{}'.format(t.label)
    print(header)
    for val, child in t.children.items():
        edge = " - ({}) -> ".format(val)
        print(prefix + edge, end="")
        print_tree(child, prefix=prefix + " " * len(edge))
# Learn a tree for 'Eatability' from the toy mushroom examples and print it.
t = id3(mushrooms_examples, {'Color', 'Size', 'Points'}, 'Eatability')
print_tree(t)
[Points] - (no) -> label:eatable - (yes) -> [Size] - (small) -> label:toxic - (large) -> label:eatable
def misclass_rate(tree, examples, target):
    """Return the fraction of examples that tree classifies incorrectly.

    tree must offer classify(example); an example counts as correct when
    the prediction equals example[target].  Raises ZeroDivisionError when
    examples is empty.
    """
    hits = [bool(tree.classify(x) == x[target]) for x in examples]
    n_wrong = len(hits) - sum(hits)
    return n_wrong / len(hits)
import random
def cross_validate_id3(examples, target, k, seed=0):
    """Run k-fold cross-validation of id3 and return the mean error rate.

    The examples are shuffled (after seeding the module-level random
    generator with seed) and cut into k folds of equal size; examples left
    over are handed out one per fold starting with the first.  Each fold
    serves once as the test set for a tree trained on the other folds.
    One summary line is printed per fold.
    """
    random.seed(seed)
    shuffled = list(examples)  ## work on a copy, the caller's list stays intact
    random.shuffle(shuffled)

    fold_size = len(shuffled) // k  ## integer division
    folds = []
    start = 0
    for _ in range(k):
        folds.append(shuffled[start:start + fold_size])
        start += fold_size
    ## len not evenly divisible by k: spread the remainder over the folds
    for pos, example in enumerate(shuffled[start:]):
        folds[pos % len(folds)].append(example)

    attributes = set(examples[0].keys()).difference([target])
    total = 0
    for held_out, test in enumerate(folds):
        train = [x for j, fold in enumerate(folds) if j != held_out for x in fold]
        tree = id3(train, attributes, target)
        misclass = misclass_rate(tree, test, target)
        print('Fold {}: n_train={}, n_test={}, misclass_rate={:.2f}'.format(
            held_out, len(train), len(test), misclass))
        total += misclass
    return total / k
# Error of the mushroom tree on the very examples it was trained on.
misclass_rate(t, mushrooms_examples, 'Eatability')
0.0
# 6-fold cross-validation on the dogs examples.
# NOTE(review): dogs_examples is only assigned further down in this file,
# so this cell depends on notebook execution order.
avg = cross_validate_id3(dogs_examples, 'Class', 6)
print("Average misclassification rate: {:.2f}".format(avg))
Fold 0: n_train=5, n_test=2, misclass_rate=0.50 Fold 1: n_train=6, n_test=1, misclass_rate=0.00 Fold 2: n_train=6, n_test=1, misclass_rate=1.00 Fold 3: n_train=6, n_test=1, misclass_rate=0.00 Fold 4: n_train=6, n_test=1, misclass_rate=1.00 Fold 5: n_train=6, n_test=1, misclass_rate=1.00 Average misclassification rate: 0.58
def pruning(tree, branch, examples, examples_sub, target):
    """Prune the subtree rooted at branch in place and return tree.

    tree         -- root of the whole tree (used for printing and returned)
    branch       -- node currently considered for pruning
    examples     -- validation examples used to measure the error
    examples_sub -- the validation examples that reach branch; their most
                    common class becomes the label of a pruned branch
    target       -- name of the class attribute

    branch is tentatively collapsed into a leaf.  If that strictly lowers
    the error, the collapse is kept.  Otherwise the node is restored and
    its children are considered recursively.  Progress is printed along
    the way.

    NOTE(review): both errors are measured by classifying all of examples
    with branch alone, not with the whole tree and not only on
    examples_sub -- confirm that this is the intended criterion.
    """
    if not branch.attribute:  # a leaf, nothing to prune
        print('a leaf, label is', branch.label)
        print_tree(tree)
        return tree

    print('a sub tree, key is', branch.attribute)
    # Remember the node's state so it can be restored in place; restoring
    # a detached copy would leave the node inside the tree collapsed.
    saved_label = branch.label
    saved_attribute = branch.attribute
    saved_children = branch.children
    old_err = misclass_rate(branch, examples, target)

    # Tentatively turn the node into a leaf.  With no validation example
    # reaching it there is nothing to vote on, so the label is kept.
    if examples_sub:
        branch.label = most_common_class(examples_sub, target)
    branch.attribute = ''
    branch.children = {}
    new_err = misclass_rate(branch, examples, target)
    print(new_err, old_err)

    if old_err > new_err:  # new is better, keep the collapsed node
        print('pruning it!')
        print_tree(tree)
        return tree

    # old is at least as good: undo the collapse and try the children
    print('keep it!')
    branch.label = saved_label
    branch.attribute = saved_attribute
    branch.children = saved_children
    print_tree(tree)
    for val, child in branch.children.items():
        ex = [x for x in examples_sub if x[saved_attribute] == val]
        print('check', val)
        tree = pruning(tree, child, examples, ex, target)
    return tree
# Dogs data set used for training, parsed from an inline tab-separated table.
dogs_df = pd.read_table(StringIO("""
Color\tFur\tSize\tClass
brown\tragged\tsmall\twell-behaved
black\tragged\tbig\tdangerous
black\tsmooth\tbig\tdangerous
black\tcurly\tsmall\twell-behaved
white\tcurly\tsmall\twell-behaved
white\tcurly\tbig\tdangerous
white\tsmooth\tsmall\tdangerous
red\tragged\tbig\twell-behaved"""))
# One {column: value} dict per row.
dogs_examples = list(dogs_df.T.to_dict().values())
# Learn and print the unpruned tree; `t` is reused by the pruning cell below.
t = id3(dogs_examples, {'Color', 'Fur', 'Size'}, 'Class')
print_tree(t)
[Fur] - (curly) -> [Size] - (small) -> label:well-behaved - (big) -> label:dangerous - (ragged) -> [Color] - (black) -> label:dangerous - (red) -> label:well-behaved - (brown) -> label:well-behaved - (smooth) -> label:dangerous
# Second dogs table with the same columns, used as the validation set for pruning.
dogs_df_dev = pd.read_table(StringIO("""
Color\tFur\tSize\tClass
brown\tragged\tsmall\twell-behaved
black\tragged\tbig\twell-behaved
black\tsmooth\tbig\tdangerous
black\tcurly\tsmall\twell-behaved
white\tcurly\tsmall\twell-behaved
white\tcurly\tbig\twell-behaved
white\tsmooth\tsmall\tdangerous
red\tragged\tbig\twell-behaved"""))
# One {column: value} dict per row.
dogs_examples_dev = list(dogs_df_dev.T.to_dict().values())
dogs_examples_dev
# Start pruning at the root, with every validation example reaching it.
pruning_branch = t
pruning_ex = dogs_examples_dev
new_t = pruning(t, pruning_branch, dogs_examples_dev, pruning_ex, 'Class')
print('final')
print_tree(new_t)
a sub tree, key is Fur 0.25 0.25 keep it! [Fur] - (curly) -> [Size] - (small) -> label:well-behaved - (big) -> label:dangerous - (ragged) -> [Color] - (black) -> label:dangerous - (red) -> label:well-behaved - (brown) -> label:well-behaved - (smooth) -> label:dangerous check curly a sub tree, key is Size 0.25 0.5 pruning it! [Fur] - (curly) -> label:well-behaved - (ragged) -> [Color] - (black) -> label:dangerous - (red) -> label:well-behaved - (brown) -> label:well-behaved - (smooth) -> label:dangerous check ragged a sub tree, key is Color 0.25 0.375 pruning it! [Fur] - (curly) -> label:well-behaved - (ragged) -> label:well-behaved - (smooth) -> label:dangerous check smooth a leaf, label is dangerous [Fur] - (curly) -> label:well-behaved - (ragged) -> label:well-behaved - (smooth) -> label:dangerous final [Fur] - (curly) -> label:well-behaved - (ragged) -> label:well-behaved - (smooth) -> label:dangerous
from urllib.request import urlopen
# Column names for car.data (passed as `names`); the last one is the class.
cars_columns = ['BuyingPrice', 'MaintentancePrice', 'Doors', 'Persons', 'Luggage', 'Safety', 'Acceptable']
# Download the data from the URL below -- requires network access.
cars = pd.read_csv(urlopen("http://mlr.cs.umass.edu/ml/machine-learning-databases/car/car.data"),
names=cars_columns)
# One {column: value} dict per row.
cars_examples = list(cars.T.to_dict().values())
# Learn a tree on all rows, using every column except the class as attribute.
tree = id3(cars_examples, set(cars_columns).difference(['Acceptable']), 'Acceptable')
print_tree(tree)
[Safety] - (high) -> [Persons] - (4) -> [BuyingPrice] - (high) -> [MaintentancePrice] - (high) -> label:acc - (low) -> label:acc - (vhigh) -> label:unacc - (med) -> label:acc - (low) -> [MaintentancePrice] - (high) -> [Luggage] - (small) -> label:acc - (big) -> label:vgood - (med) -> [Doors] - (5more) -> label:vgood - (4) -> label:vgood - (2) -> label:acc - (3) -> label:acc - (low) -> [Luggage] - (small) -> label:good - (big) -> label:vgood - (med) -> [Doors] - (5more) -> label:vgood - (4) -> label:vgood - (2) -> label:good - (3) -> label:good - (vhigh) -> label:acc - (med) -> [Luggage] - (small) -> label:good - (big) -> label:vgood - (med) -> [Doors] - (5more) -> label:vgood - (4) -> label:vgood - (2) -> label:good - (3) -> label:good - (vhigh) -> [MaintentancePrice] - (high) -> label:unacc - (low) -> label:acc - (vhigh) -> label:unacc - (med) -> label:acc - (med) -> [MaintentancePrice] - (high) -> label:acc - (low) -> [Luggage] - (small) -> label:good - (big) -> label:vgood - (med) -> [Doors] - (5more) -> label:vgood - (4) -> label:vgood - (2) -> label:good - (3) -> label:good - (vhigh) -> label:acc - (med) -> [Luggage] - (small) -> label:acc - (big) -> label:vgood - (med) -> [Doors] - (5more) -> label:vgood - (4) -> label:vgood - (2) -> label:acc - (3) -> label:acc - (2) -> label:unacc - (more) -> [BuyingPrice] - (high) -> [MaintentancePrice] - (high) -> [Doors] - (5more) -> label:acc - (4) -> label:acc - (2) -> [Luggage] - (small) -> label:unacc - (big) -> label:acc - (med) -> label:acc - (3) -> label:acc - (low) -> [Doors] - (5more) -> label:acc - (4) -> label:acc - (2) -> [Luggage] - (small) -> label:unacc - (big) -> label:acc - (med) -> label:acc - (3) -> label:acc - (vhigh) -> label:unacc - (med) -> [Doors] - (5more) -> label:acc - (4) -> label:acc - (2) -> [Luggage] - (small) -> label:unacc - (big) -> label:acc - (med) -> label:acc - (3) -> label:acc - (low) -> [MaintentancePrice] - (high) -> [Luggage] - (small) -> [Doors] - (5more) -> label:acc - (4) -> 
label:acc - (2) -> label:unacc - (3) -> label:acc - (big) -> label:vgood - (med) -> [Doors] - (5more) -> label:vgood - (4) -> label:vgood - (2) -> label:acc - (3) -> label:vgood - (low) -> [Luggage] - (small) -> [Doors] - (5more) -> label:good - (4) -> label:good - (2) -> label:unacc - (3) -> label:good - (big) -> label:vgood - (med) -> [Doors] - (5more) -> label:vgood - (4) -> label:vgood - (2) -> label:good - (3) -> label:vgood - (vhigh) -> [Doors] - (5more) -> label:acc - (4) -> label:acc - (2) -> [Luggage] - (small) -> label:unacc - (big) -> label:acc - (med) -> label:acc - (3) -> label:acc - (med) -> [Luggage] - (small) -> [Doors] - (5more) -> label:good - (4) -> label:good - (2) -> label:unacc - (3) -> label:good - (big) -> label:vgood - (med) -> [Doors] - (5more) -> label:vgood - (4) -> label:vgood - (2) -> label:good - (3) -> label:vgood - (vhigh) -> [MaintentancePrice] - (high) -> label:unacc - (low) -> [Doors] - (5more) -> label:acc - (4) -> label:acc - (2) -> [Luggage] - (small) -> label:unacc - (big) -> label:acc - (med) -> label:acc - (3) -> label:acc - (vhigh) -> label:unacc - (med) -> [Doors] - (5more) -> label:acc - (4) -> label:acc - (2) -> [Luggage] - (small) -> label:unacc - (big) -> label:acc - (med) -> label:acc - (3) -> label:acc - (med) -> [MaintentancePrice] - (high) -> [Doors] - (5more) -> label:acc - (4) -> label:acc - (2) -> [Luggage] - (small) -> label:unacc - (big) -> label:acc - (med) -> label:acc - (3) -> label:acc - (low) -> [Luggage] - (small) -> [Doors] - (5more) -> label:good - (4) -> label:good - (2) -> label:unacc - (3) -> label:good - (big) -> label:vgood - (med) -> [Doors] - (5more) -> label:vgood - (4) -> label:vgood - (2) -> label:good - (3) -> label:vgood - (vhigh) -> [Doors] - (5more) -> label:acc - (4) -> label:acc - (2) -> [Luggage] - (small) -> label:unacc - (big) -> label:acc - (med) -> label:acc - (3) -> label:acc - (med) -> [Luggage] - (small) -> [Doors] - (5more) -> label:acc - (4) -> label:acc - (2) -> label:unacc 
- (3) -> label:acc - (big) -> label:vgood - (med) -> [Doors] - (5more) -> label:vgood - (4) -> label:vgood - (2) -> label:acc - (3) -> label:vgood - (low) -> label:unacc - (med) -> [Persons] - (4) -> [BuyingPrice] - (high) -> [Luggage] - (small) -> label:unacc - (big) -> [MaintentancePrice] - (high) -> label:acc - (low) -> label:acc - (vhigh) -> label:unacc - (med) -> label:acc - (med) -> [Doors] - (5more) -> [MaintentancePrice] - (high) -> label:acc - (low) -> label:acc - (vhigh) -> label:unacc - (med) -> label:acc - (4) -> [MaintentancePrice] - (high) -> label:acc - (low) -> label:acc - (vhigh) -> label:unacc - (med) -> label:acc - (2) -> label:unacc - (3) -> label:unacc - (low) -> [MaintentancePrice] - (high) -> label:acc - (low) -> [Luggage] - (small) -> label:acc - (big) -> label:good - (med) -> [Doors] - (5more) -> label:good - (4) -> label:good - (2) -> label:acc - (3) -> label:acc - (vhigh) -> [Luggage] - (small) -> label:unacc - (big) -> label:acc - (med) -> [Doors] - (5more) -> label:acc - (4) -> label:acc - (2) -> label:unacc - (3) -> label:unacc - (med) -> [Luggage] - (small) -> label:acc - (big) -> label:good - (med) -> [Doors] - (5more) -> label:good - (4) -> label:good - (2) -> label:acc - (3) -> label:acc - (vhigh) -> [MaintentancePrice] - (high) -> label:unacc - (low) -> [Luggage] - (small) -> label:unacc - (big) -> label:acc - (med) -> [Doors] - (5more) -> label:acc - (4) -> label:acc - (2) -> label:unacc - (3) -> label:unacc - (vhigh) -> label:unacc - (med) -> [Luggage] - (small) -> label:unacc - (big) -> label:acc - (med) -> [Doors] - (5more) -> label:acc - (4) -> label:acc - (2) -> label:unacc - (3) -> label:unacc - (med) -> [MaintentancePrice] - (high) -> [Luggage] - (small) -> label:unacc - (big) -> label:acc - (med) -> [Doors] - (5more) -> label:acc - (4) -> label:acc - (2) -> label:unacc - (3) -> label:unacc - (low) -> [Luggage] - (small) -> label:acc - (big) -> label:good - (med) -> [Doors] - (5more) -> label:good - (4) -> label:good - (2) 
-> label:acc - (3) -> label:acc - (vhigh) -> [Luggage] - (small) -> label:unacc - (big) -> label:acc - (med) -> [Doors] - (5more) -> label:acc - (4) -> label:acc - (2) -> label:unacc - (3) -> label:unacc - (med) -> label:acc - (2) -> label:unacc - (more) -> [BuyingPrice] - (high) -> [Luggage] - (small) -> label:unacc - (big) -> [MaintentancePrice] - (high) -> label:acc - (low) -> label:acc - (vhigh) -> label:unacc - (med) -> label:acc - (med) -> [MaintentancePrice] - (high) -> [Doors] - (5more) -> label:acc - (4) -> label:acc - (2) -> label:unacc - (3) -> label:acc - (low) -> [Doors] - (5more) -> label:acc - (4) -> label:acc - (2) -> label:unacc - (3) -> label:acc - (vhigh) -> label:unacc - (med) -> [Doors] - (5more) -> label:acc - (4) -> label:acc - (2) -> label:unacc - (3) -> label:acc - (low) -> [MaintentancePrice] - (high) -> [Doors] - (5more) -> label:acc - (4) -> label:acc - (2) -> [Luggage] - (small) -> label:unacc - (big) -> label:acc - (med) -> label:acc - (3) -> label:acc - (low) -> [Luggage] - (small) -> [Doors] - (5more) -> label:acc - (4) -> label:acc - (2) -> label:unacc - (3) -> label:acc - (big) -> label:good - (med) -> [Doors] - (5more) -> label:good - (4) -> label:good - (2) -> label:acc - (3) -> label:good - (vhigh) -> [Luggage] - (small) -> label:unacc - (big) -> label:acc - (med) -> [Doors] - (5more) -> label:acc - (4) -> label:acc - (2) -> label:unacc - (3) -> label:acc - (med) -> [Luggage] - (small) -> [Doors] - (5more) -> label:acc - (4) -> label:acc - (2) -> label:unacc - (3) -> label:acc - (big) -> label:good - (med) -> [Doors] - (5more) -> label:good - (4) -> label:good - (2) -> label:acc - (3) -> label:good - (vhigh) -> [MaintentancePrice] - (high) -> label:unacc - (low) -> [Luggage] - (small) -> label:unacc - (big) -> label:acc - (med) -> [Doors] - (5more) -> label:acc - (4) -> label:acc - (2) -> label:unacc - (3) -> label:acc - (vhigh) -> label:unacc - (med) -> [Luggage] - (small) -> label:unacc - (big) -> label:acc - (med) -> [Doors] 
- (5more) -> label:acc - (4) -> label:acc - (2) -> label:unacc - (3) -> label:acc - (med) -> [MaintentancePrice] - (high) -> [Luggage] - (small) -> label:unacc - (big) -> label:acc - (med) -> [Doors] - (5more) -> label:acc - (4) -> label:acc - (2) -> label:unacc - (3) -> label:acc - (low) -> [Luggage] - (small) -> [Doors] - (5more) -> label:acc - (4) -> label:acc - (2) -> label:unacc - (3) -> label:acc - (big) -> label:good - (med) -> [Doors] - (5more) -> label:good - (4) -> label:good - (2) -> label:acc - (3) -> label:good - (vhigh) -> [Luggage] - (small) -> label:unacc - (big) -> label:acc - (med) -> [Doors] - (5more) -> label:acc - (4) -> label:acc - (2) -> label:unacc - (3) -> label:acc - (med) -> [Doors] - (5more) -> label:acc - (4) -> label:acc - (2) -> [Luggage] - (small) -> label:unacc - (big) -> label:acc - (med) -> label:acc - (3) -> label:acc
# (error on the training data, number of tree nodes, number of examples)
misclass_rate(tree, cars_examples, 'Acceptable'), tree.size(), len(cars_examples)
(0.0, 408, 1728)
# 3-fold cross-validation on the car examples; the result is the mean error.
cross_validate_id3(cars_examples, 'Acceptable', 3)
Fold 0: n_train=1152, n_test=576, misclass_rate=0.09 Fold 1: n_train=1152, n_test=576, misclass_rate=0.06 Fold 2: n_train=1152, n_test=576, misclass_rate=0.07
0.07349537037037036
%matplotlib inline
import matplotlib.pyplot as plt
# Learning curve: error rate as a function of the number of training examples.
# Shuffle a copy of the car examples, hold out the first 500 as the test set
# and keep the rest as the training pool.
train = cars_examples.copy()
random.shuffle(train)
test = train[:500]
train = train[500:]
# Training-set sizes to try: 10, 30, ..., below 1200.
# NOTE(review): `np` is not imported in the visible part of the file.
x = np.arange(10, 1200, 20)
ts = []
test_error = []
train_error = []
for i in x:
# Train on the first i examples of the pool only.
t = id3(train[:i], None, 'Acceptable')
ts.append(t.size())
test_error.append(misclass_rate(t, test, 'Acceptable'))
# NOTE(review): the "train" error is measured on the whole pool, not just
# on the train[:i] examples the tree was fitted to -- confirm intent.
train_error.append(misclass_rate(t, train, 'Acceptable'))
plt.ylabel('Error rate')
plt.xlabel('Training set size')
plt.plot(x, train_error, label='train error')
plt.plot(x, test_error, label='test error')
plt.grid(True)
plt.legend();
# Column names for agaricus-lepiota.data (passed as `names`); the class
# column 'Edible?' comes first.
mushrooms_columns = ['Edible?',
'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
'stalk-shape', 'stalk-root', 'stalk-surface-above-ring', 'stalk-surface-below-ring',
'stalk-color-above-ring', 'stalk-color-below-ring',
'veil-type', 'veil-color',
'ring-number', 'ring-type',
'spore-print-color',
'population', 'habitat']
# Download the data from the URL below -- requires network access.
mushrooms = pd.read_csv(urlopen("http://mlr.cs.umass.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"),
names=mushrooms_columns)
# Number of columns, class column included.
len(mushrooms_columns)
23
# Rebinds mushrooms_examples (previously the toy data) to the downloaded set.
# The [:100] slice exceeds the 23 column names, so every column is selected.
mushrooms_examples = list(mushrooms[mushrooms_columns[:100]].T.to_dict().values())
# attributes=None: split on every column except the target.
t = id3(mushrooms_examples, None, 'Edible?')
# (error on the training data, number of tree nodes, number of examples)
misclass_rate(t, mushrooms_examples, 'Edible?'), t.size(), len(mushrooms_examples)
(0.0, 27, 8124)
# 5-fold cross-validation on the mushroom examples; the result is the mean error.
cross_validate_id3(mushrooms_examples, 'Edible?', 5)
Fold 0: n_train=6499, n_test=1625, misclass_rate=0.00 Fold 1: n_train=6499, n_test=1625, misclass_rate=0.00 Fold 2: n_train=6499, n_test=1625, misclass_rate=0.00 Fold 3: n_train=6499, n_test=1625, misclass_rate=0.00 Fold 4: n_train=6500, n_test=1624, misclass_rate=0.00
0.0
# Column names: the class column 'party' followed by vote0 .. vote15.
voting_colums = ['party'] + ['vote%d' % i for i in range(16)]
# Download the data from the URL below -- requires network access.
voting = pd.read_csv(
urlopen("https://archive.ics.uci.edu/ml/machine-learning-databases/voting-records/house-votes-84.data"),
names=voting_colums,
header=None
)
# One {column: value} dict per row.
voting_examples = list(voting.T.to_dict().values())
t = id3(voting_examples, set(voting_colums).difference(['party']), 'party')
# (error on the training data, number of tree nodes, number of examples)
misclass_rate(t, voting_examples, 'party'), t.size(), len(voting_examples)
(0.0, 60, 435)
# 5-fold cross-validation on the voting examples; the result is the mean error.
cross_validate_id3(voting_examples, 'party', 5)
Fold 0: n_train=348, n_test=87, misclass_rate=0.05 Fold 1: n_train=348, n_test=87, misclass_rate=0.07 Fold 2: n_train=348, n_test=87, misclass_rate=0.05 Fold 3: n_train=348, n_test=87, misclass_rate=0.05 Fold 4: n_train=348, n_test=87, misclass_rate=0.10
0.06206896551724138
# Column names: x0 .. x35 followed by the class column 'outcome'.
chess_columns = ['x%d' % x for x in range(36)] + ['outcome']
# Download the data from the URL below -- requires network access.
chess = pd.read_csv(
urlopen("https://archive.ics.uci.edu/ml/machine-learning-databases/chess/king-rook-vs-king-pawn/kr-vs-kp.data"),
names=chess_columns,
header=None
)
# One {column: value} dict per row.
chess_examples = list(chess.T.to_dict().values())
t = id3(chess_examples, set(chess_columns).difference(['outcome']), 'outcome')
# (error on the training data, number of tree nodes, number of examples)
misclass_rate(t, chess_examples, 'outcome'), t.size(), len(chess_examples)
(0.0, 93, 3196)
# 5-fold cross-validation on the chess examples; the result is the mean error.
cross_validate_id3(chess_examples, 'outcome', 5)
Fold 0: n_train=2556, n_test=640, misclass_rate=0.01 Fold 1: n_train=2557, n_test=639, misclass_rate=0.01 Fold 2: n_train=2557, n_test=639, misclass_rate=0.00 Fold 3: n_train=2557, n_test=639, misclass_rate=0.00 Fold 4: n_train=2557, n_test=639, misclass_rate=0.00
0.005317879499217528