"""Data sets for machine learning problems. (Chapters 18-21)."""

from __future__ import nested_scopes
import utils
from learning import *
import random

#______________________________________________________________________________

def RestaurantDataSet(examples):
    """Wrap `examples` in a DataSet for the restaurant waiting problem.

    The DataSet is named 'Restaurant', declares the eleven attribute names
    below in column order, and uses 'Wait' as the target attribute.
    `examples` is passed through to DataSet unchanged.
    """
    columns = ('Alternate Bar Fri/Sat Hungry Patrons Price '
               + 'Raining Reservation Type WaitEstimate Wait')
    return DataSet(name='Restaurant',
                   target='Wait',
                   examples=examples,
                   attrnames=columns,
                   doc='Data from AIMA [Fig. 18.5]')


# The 12 restaurant-waiting examples, one per line.  Each line holds eleven
# whitespace-separated values in the attribute order declared by
# RestaurantDataSet (the last column, Wait, is the target).
# NOTE(review): the string is handed to DataSet as-is; presumably DataSet
# parses it into rows -- confirm in learning.py.
restaurant = RestaurantDataSet("""
Yes No  No  Yes Some $$$ No   Yes French  0-10   Yes
Yes No  No  Yes Full $   No   No  Thai    30-60  No
No  Yes No  No  Some $   No   No  Burger  0-10   Yes
Yes No  Yes Yes Full $   No   No  Thai    10-30  Yes
Yes No  Yes No  Full $$$ No   Yes French  >60    No 
No  Yes No  Yes Some $$  Yes  Yes Italian 0-10   Yes
No  Yes No  No  None $   Yes  No  Burger  0-10   No
No  No  No  Yes Some $$  Yes  Yes Thai    0-10   Yes
No  Yes Yes No  Full $   Yes  No  Burger  >60    No 
Yes Yes Yes Yes Full $$$ No   Yes Italian 10-30  No 
No  No  No  No  None $   No   No  Thai    0-10   No 
Yes Yes Yes Yes Full $   No   No  Burger  30-60  Yes""")

def SyntheticRestaurant(n=20):
    """Generate a Restaurant DataSet with n randomly generated examples.

    Every attribute of an example is drawn independently with random.choice
    from the matching entry of restaurant.values; the target slot is then
    overwritten with the prediction of the hand-built decision tree below,
    so each label is consistent with that tree.
    """
    def T(attrname, branches):
        # Build a tree node testing `attrname`.  `branches` maps each value
        # of that attribute to either a label string or a nested node.
        return DecisionTree(restaurant.attrnum(attrname), attrname, branches)
    tree = T('Patrons', {
        'None': 'No',
        'Some': 'Yes',
        'Full': T('WaitEstimate', {
            '>60': 'No',
            '0-10': 'Yes',
            '30-60': T('Alternate', {
                'No': T('Reservation', {
                    'Yes': 'Yes',
                    'No': T('Bar', {'No': 'No', 'Yes': 'Yes'})}),
                'Yes': T('Fri/Sat', {'No': 'No', 'Yes': 'Yes'})}),
            '10-30': T('Hungry', {
                'No': 'Yes',
                'Yes': T('Alternate', {
                    'No': 'Yes',
                    'Yes': T('Raining', {'No': 'No', 'Yes': 'Yes'})})})})})
    def gen():
        # Build a real list rather than using map(): the target slot is
        # assigned by index below, which a lazy Python 3 map object does not
        # support.  The random draws happen in the same order as before.
        # Assumes restaurant.values holds one sequence of candidate values
        # per attribute -- TODO confirm against DataSet.
        example = [random.choice(values) for values in restaurant.values]
        example[restaurant.target] = tree.predict(example)
        return example
    return RestaurantDataSet([gen() for _ in range(n)])


#______________________________________________________________________________

# Challenger O-ring data: 23 examples, one per line, each with five
# whitespace-separated integer columns in `attrnames` order.  'Distressed'
# (the second column) is the target.
orings = DataSet(name='O-Rings',
attrnames="Rings Distressed Temp Pressure Flightnum", target='Distressed',
examples="""
6 0 66  50  1
6 1 70  50  2
6 0 69  50  3
6 0 68  50  4
6 0 67  50  5
6 0 72  50  6
6 0 73 100  7
6 0 70 100  8
6 1 57 200  9
6 1 63 200 10
6 1 70 200 11
6 0 78 200 12
6 0 67 200 13
6 2 53 200 14
6 0 67 200 15
6 0 75 200 16
6 0 70 200 17
6 0 81 200 18
6 0 76 200 19
6 0 79 200 20
6 0 75 200 21
6 0 76 200 22
6 1 58 200 23""",
source="http://www1.ics.uci.edu/pub/machine-learning-databases/space-shuttle/",
# Raw string: the text contains TeX markup such as {\it ...} and \&, which
# are invalid escape sequences in an ordinary string literal.  The resulting
# string value is the same as before.
doc=r"""1. Title: Challenger Space Shuttle O-Ring Data (2 databases)

2. Sources:
   -- David Draper (draper@math.ucla.edu)
      University of California, Los Angeles
   -- Donor: David Draper (draper@math.ucla.edu)
   -- Date: 5 August 1993

3. Past Usage:

   1. Draper,~D. (1993).  Assessment and propagation of model uncertainty.  
      In {\it Proceedings of the Fourth International Workshop on Artificial
      Intelligence and Statistics} (pp. 497--509).  Ft. Lauderdale, FL:  
      Unpublished.
      -- Discrete model uncertainty analysis
      -- Analysis suggests that obvious different extrapolations of the
         data exist at 31 degrees Fahrenheit (i.e., freezing), which sharply
         discredits the assumption of no temperature effect.
   2. Dalal,~S.~R., Fowlkes,~E.~B., \& Hoadley,~B. (1989). Risk analysis of
      the space shuttle: pre-Challenger prediction of failure. {\it Journal
      of the American Statisticians Association}, {\it 84}, 945--957.
   3. Lavine,~M. (1991). Problems in extrapolation illustrated with space 
      shuttle O-ring data.  {\it Journal of the American Statisticians
      Association}, {\it 86}, 919--922.
   4. Martz~H.~F., \& Zimmer,~W.~J. (1992). The risk of catastrophic failure
      of the solid rocket boosters on the space shuttle.  {\it American
      Statistics}, {\it 46}, 42--47. 

4. Number of instances: 23 in each of two files

5. Relevant Information:

   There are two databases: (both use the same set of 5 attributes)
     1. Primary o-ring erosion and/or blowby
     2. Primary o-ring erosion only
   The two databases are identical except for the 2nd attribute of the
   21st instance (confirmed by David Draper on 8/5/93).

   Edited from (Draper, 1993):
      The motivation for collecting this database was the explosion of the
   USA Space Shuttle Challenger on 28 January, 1986.  An investigation
   ensued into the reliability of the shuttle's propulsion system.  The
   explosion was eventually traced to the failure of one of the three field 
   joints on one of the two solid booster rockets.  Each of these six field 
   joints includes two O-rings, designated as primary and secondary, which
   fail when phenomena called erosion and blowby both occur. 
      The night before the launch a decision had to be made regarding
   launch safety.  The discussion among engineers and managers leading to
   this decision included concern that the probability of failure of the
   O-rings depended on the temperature t at launch, which was forecase to
   be 31 degrees F. There are strong engineering reasons based on the
   composition of O-rings to support the judgment that failure
   probability may rise monotonically as temperature drops.  One other
   variable, the pressure s at which safety testing for field join leaks
   was performed, was available, but its relevance to the failure process
   was unclear.
       Draper's paper includes a menacing figure graphing the number of field
   joints experiencing stress vs. liftoff temperature for the 23 shuttle 
   flights previous to the Challenger disaster.  No previous liftoff 
   temperature was under 53 degrees F.  Although tremendous extrapolation
   must be done from the given data to assess risk at 31 degrees F, it
   is obvious even to the layman "to foresee the unacceptably high risk
   created by launching at 31 degrees F."  For more information, see
   Draper (1993) or the other previous analyses.
       The task is to predict the number of O-rings that will experience
   thermal distress for a given flight when the launch temperature is 
   below freezing.

6. Number of Attributes: 5
     1. Number of O-rings at risk on a given flight
     2. Number experiencing thermal distress
     3. Launch temperature (degrees F)
     4. Leak-check pressure (psi)
     5. Temporal order of flight

7. Attribute Information: all values are positive integers""")

#______________________________________________________________________________
       
# Zoo animal-classification data: 101 examples, one per line, each with 18
# comma-separated columns in `attrnames` order.  'type' (the last column) is
# the target; the 'name' column is listed in `exclude`.
# NOTE(review): presumably `exclude` keeps 'name' out of the learner inputs --
# confirm against DataSet in learning.py.
zoo = DataSet(name='Zoo', target='type', exclude=['name'],
attrnames="""name hair feathers eggs milk airborne aquatic predator toothed backbone breathes venomous fins legs tail domestic catsize type""", 
examples="""
aardvark,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,mammal
antelope,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,mammal
bass,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,fish
bear,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,mammal
boar,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,mammal
buffalo,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,mammal
calf,1,0,0,1,0,0,0,1,1,1,0,0,4,1,1,1,mammal
carp,0,0,1,0,0,1,0,1,1,0,0,1,0,1,1,0,fish
catfish,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,fish
cavy,1,0,0,1,0,0,0,1,1,1,0,0,4,0,1,0,mammal
cheetah,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,mammal
chicken,0,1,1,0,1,0,0,0,1,1,0,0,2,1,1,0,bird
chub,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,fish
clam,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,shellfish
crab,0,0,1,0,0,1,1,0,0,0,0,0,4,0,0,0,shellfish
crayfish,0,0,1,0,0,1,1,0,0,0,0,0,6,0,0,0,shellfish
crow,0,1,1,0,1,0,1,0,1,1,0,0,2,1,0,0,bird
deer,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,mammal
dogfish,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,1,fish
dolphin,0,0,0,1,0,1,1,1,1,1,0,1,0,1,0,1,mammal
dove,0,1,1,0,1,0,0,0,1,1,0,0,2,1,1,0,bird
duck,0,1,1,0,1,1,0,0,1,1,0,0,2,1,0,0,bird
elephant,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,mammal
flamingo,0,1,1,0,1,0,0,0,1,1,0,0,2,1,0,1,bird
flea,0,0,1,0,0,0,0,0,0,1,0,0,6,0,0,0,insect
frog,0,0,1,0,0,1,1,1,1,1,0,0,4,0,0,0,amphibian
frog,0,0,1,0,0,1,1,1,1,1,1,0,4,0,0,0,amphibian
fruitbat,1,0,0,1,1,0,0,1,1,1,0,0,2,1,0,0,mammal
giraffe,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,mammal
girl,1,0,0,1,0,0,1,1,1,1,0,0,2,0,1,1,mammal
gnat,0,0,1,0,1,0,0,0,0,1,0,0,6,0,0,0,insect
goat,1,0,0,1,0,0,0,1,1,1,0,0,4,1,1,1,mammal
gorilla,1,0,0,1,0,0,0,1,1,1,0,0,2,0,0,1,mammal
gull,0,1,1,0,1,1,1,0,1,1,0,0,2,1,0,0,bird
haddock,0,0,1,0,0,1,0,1,1,0,0,1,0,1,0,0,fish
hamster,1,0,0,1,0,0,0,1,1,1,0,0,4,1,1,0,mammal
hare,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,0,mammal
hawk,0,1,1,0,1,0,1,0,1,1,0,0,2,1,0,0,bird
herring,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,fish
honeybee,1,0,1,0,1,0,0,0,0,1,1,0,6,0,1,0,insect
housefly,1,0,1,0,1,0,0,0,0,1,0,0,6,0,0,0,insect
kiwi,0,1,1,0,0,0,1,0,1,1,0,0,2,1,0,0,bird
ladybird,0,0,1,0,1,0,1,0,0,1,0,0,6,0,0,0,insect
lark,0,1,1,0,1,0,0,0,1,1,0,0,2,1,0,0,bird
leopard,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,mammal
lion,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,mammal
lobster,0,0,1,0,0,1,1,0,0,0,0,0,6,0,0,0,shellfish
lynx,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,mammal
mink,1,0,0,1,0,1,1,1,1,1,0,0,4,1,0,1,mammal
mole,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,0,mammal
mongoose,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,mammal
moth,1,0,1,0,1,0,0,0,0,1,0,0,6,0,0,0,insect
newt,0,0,1,0,0,1,1,1,1,1,0,0,4,1,0,0,amphibian
octopus,0,0,1,0,0,1,1,0,0,0,0,0,8,0,0,1,shellfish
opossum,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,0,mammal
oryx,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,mammal
ostrich,0,1,1,0,0,0,0,0,1,1,0,0,2,1,0,1,bird
parakeet,0,1,1,0,1,0,0,0,1,1,0,0,2,1,1,0,bird
penguin,0,1,1,0,0,1,1,0,1,1,0,0,2,1,0,1,bird
pheasant,0,1,1,0,1,0,0,0,1,1,0,0,2,1,0,0,bird
pike,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,1,fish
piranha,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,fish
pitviper,0,0,1,0,0,0,1,1,1,1,1,0,0,1,0,0,reptile
platypus,1,0,1,1,0,1,1,0,1,1,0,0,4,1,0,1,mammal
polecat,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,mammal
pony,1,0,0,1,0,0,0,1,1,1,0,0,4,1,1,1,mammal
porpoise,0,0,0,1,0,1,1,1,1,1,0,1,0,1,0,1,mammal
puma,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,mammal
pussycat,1,0,0,1,0,0,1,1,1,1,0,0,4,1,1,1,mammal
raccoon,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,mammal
reindeer,1,0,0,1,0,0,0,1,1,1,0,0,4,1,1,1,mammal
rhea,0,1,1,0,0,0,1,0,1,1,0,0,2,1,0,1,bird
scorpion,0,0,0,0,0,0,1,0,0,1,1,0,8,1,0,0,shellfish
seahorse,0,0,1,0,0,1,0,1,1,0,0,1,0,1,0,0,fish
seal,1,0,0,1,0,1,1,1,1,1,0,1,0,0,0,1,mammal
sealion,1,0,0,1,0,1,1,1,1,1,0,1,2,1,0,1,mammal
seasnake,0,0,0,0,0,1,1,1,1,0,1,0,0,1,0,0,reptile
seawasp,0,0,1,0,0,1,1,0,0,0,1,0,0,0,0,0,shellfish
skimmer,0,1,1,0,1,1,1,0,1,1,0,0,2,1,0,0,bird
skua,0,1,1,0,1,1,1,0,1,1,0,0,2,1,0,0,bird
slowworm,0,0,1,0,0,0,1,1,1,1,0,0,0,1,0,0,reptile
slug,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,shellfish
sole,0,0,1,0,0,1,0,1,1,0,0,1,0,1,0,0,fish
sparrow,0,1,1,0,1,0,0,0,1,1,0,0,2,1,0,0,bird
squirrel,1,0,0,1,0,0,0,1,1,1,0,0,2,1,0,0,mammal
starfish,0,0,1,0,0,1,1,0,0,0,0,0,5,0,0,0,shellfish
stingray,0,0,1,0,0,1,1,1,1,0,1,1,0,1,0,1,fish
swan,0,1,1,0,1,1,0,0,1,1,0,0,2,1,0,1,bird
termite,0,0,1,0,0,0,0,0,0,1,0,0,6,0,0,0,insect
toad,0,0,1,0,0,1,0,1,1,1,0,0,4,0,0,0,amphibian
tortoise,0,0,1,0,0,0,0,0,1,1,0,0,4,1,0,1,reptile
tuatara,0,0,1,0,0,0,1,1,1,1,0,0,4,1,0,0,reptile
tuna,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,1,fish
vampire,1,0,0,1,1,0,0,1,1,1,0,0,2,1,0,0,mammal
vole,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,0,mammal
vulture,0,1,1,0,1,0,1,0,1,1,0,0,2,1,0,1,bird
wallaby,1,0,0,1,0,0,0,1,1,1,0,0,2,1,0,1,mammal
wasp,1,0,1,0,1,0,0,0,0,1,1,0,6,0,0,0,insect
wolf,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,mammal
worm,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,shellfish
wren,0,1,1,0,1,0,0,0,1,1,0,0,2,1,0,0,bird
""",
source='http://www1.ics.uci.edu/pub/machine-learning-databases/zoo/',
doc="""
1. Title: Zoo database

2. Source Information
   -- Creator: Richard Forsyth
   -- Donor: Richard S. Forsyth 
             8 Grosvenor Avenue
             Mapperley Park
             Nottingham NG3 5DX
             0602-621676
   -- Date: 5/15/1990
 
3. Past Usage:
   -- None known other than what is shown in Forsyth's PC/BEAGLE User's Guide.

4. Relevant Information:
   -- A simple database containing 17 Boolean-valued attributes.  The "type"
      attribute appears to be the class attribute.  Here is a breakdown of
      which animals are in which type: (I find it unusual that there are
      2 instances of "frog" and one of "girl"!)

      Class# Set of animals:
      ====== ===============================================================
           1 (41) aardvark, antelope, bear, boar, buffalo, calf,
                  cavy, cheetah, deer, dolphin, elephant,
                  fruitbat, giraffe, girl, goat, gorilla, hamster,
                  hare, leopard, lion, lynx, mink, mole, mongoose,
                  opossum, oryx, platypus, polecat, pony,
                  porpoise, puma, pussycat, raccoon, reindeer,
                  seal, sealion, squirrel, vampire, vole, wallaby,wolf
           2 (20) chicken, crow, dove, duck, flamingo, gull, hawk,
                  kiwi, lark, ostrich, parakeet, penguin, pheasant,
                  rhea, skimmer, skua, sparrow, swan, vulture, wren
           3 (5)  pitviper, seasnake, slowworm, tortoise, tuatara 
           4 (13) bass, carp, catfish, chub, dogfish, haddock,
                  herring, pike, piranha, seahorse, sole, stingray, tuna
           5 (4)  frog, frog, newt, toad 
           6 (8)  flea, gnat, honeybee, housefly, ladybird, moth, termite, wasp
           7 (10) clam, crab, crayfish, lobster, octopus,
                  scorpion, seawasp, slug, starfish, worm

5. Number of Instances: 101

6. Number of Attributes: 18 (animal name, 15 Boolean attributes, 2 numerics)

7. Attribute Information: (name of attribute and type of value domain)
   1. animal name:      Unique for each instance
   2. hair              Boolean
   3. feathers          Boolean
   4. eggs              Boolean
   5. milk              Boolean
   6. airborne          Boolean
   7. aquatic           Boolean
   8. predator          Boolean
   9. toothed           Boolean
  10. backbone          Boolean
  11. breathes          Boolean
  12. venomous          Boolean
  13. fins              Boolean
  14. legs              Numeric (set of values: {0,2,4,5,6,8})
  15. tail              Boolean
  16. domestic          Boolean
  17. catsize           Boolean
  18. type              Numeric (integer values in range [1,7])

8. Missing Attribute Values: None

9. Class Distribution: Given above
   
""")
#______________________________________________________________________________

# Iris data: 150 examples, one per line (50 each of setosa, versicolor and
# virginica, in that order).  Each line has four comma-separated numeric
# measurements followed by the class name; 'class' is the target.
iris = DataSet(name="Iris",
    attrnames="sepal-len sepal-width petal-len petal-width class", target="class",
    examples="""5.1,3.5,1.4,0.2,setosa
4.9,3.0,1.4,0.2,setosa
4.7,3.2,1.3,0.2,setosa
4.6,3.1,1.5,0.2,setosa
5.0,3.6,1.4,0.2,setosa
5.4,3.9,1.7,0.4,setosa
4.6,3.4,1.4,0.3,setosa
5.0,3.4,1.5,0.2,setosa
4.4,2.9,1.4,0.2,setosa
4.9,3.1,1.5,0.1,setosa
5.4,3.7,1.5,0.2,setosa
4.8,3.4,1.6,0.2,setosa
4.8,3.0,1.4,0.1,setosa
4.3,3.0,1.1,0.1,setosa
5.8,4.0,1.2,0.2,setosa
5.7,4.4,1.5,0.4,setosa
5.4,3.9,1.3,0.4,setosa
5.1,3.5,1.4,0.3,setosa
5.7,3.8,1.7,0.3,setosa
5.1,3.8,1.5,0.3,setosa
5.4,3.4,1.7,0.2,setosa
5.1,3.7,1.5,0.4,setosa
4.6,3.6,1.0,0.2,setosa
5.1,3.3,1.7,0.5,setosa
4.8,3.4,1.9,0.2,setosa
5.0,3.0,1.6,0.2,setosa
5.0,3.4,1.6,0.4,setosa
5.2,3.5,1.5,0.2,setosa
5.2,3.4,1.4,0.2,setosa
4.7,3.2,1.6,0.2,setosa
4.8,3.1,1.6,0.2,setosa
5.4,3.4,1.5,0.4,setosa
5.2,4.1,1.5,0.1,setosa
5.5,4.2,1.4,0.2,setosa
4.9,3.1,1.5,0.1,setosa
5.0,3.2,1.2,0.2,setosa
5.5,3.5,1.3,0.2,setosa
4.9,3.1,1.5,0.1,setosa
4.4,3.0,1.3,0.2,setosa
5.1,3.4,1.5,0.2,setosa
5.0,3.5,1.3,0.3,setosa
4.5,2.3,1.3,0.3,setosa
4.4,3.2,1.3,0.2,setosa
5.0,3.5,1.6,0.6,setosa
5.1,3.8,1.9,0.4,setosa
4.8,3.0,1.4,0.3,setosa
5.1,3.8,1.6,0.2,setosa
4.6,3.2,1.4,0.2,setosa
5.3,3.7,1.5,0.2,setosa
5.0,3.3,1.4,0.2,setosa
7.0,3.2,4.7,1.4,versicolor
6.4,3.2,4.5,1.5,versicolor
6.9,3.1,4.9,1.5,versicolor
5.5,2.3,4.0,1.3,versicolor
6.5,2.8,4.6,1.5,versicolor
5.7,2.8,4.5,1.3,versicolor
6.3,3.3,4.7,1.6,versicolor
4.9,2.4,3.3,1.0,versicolor
6.6,2.9,4.6,1.3,versicolor
5.2,2.7,3.9,1.4,versicolor
5.0,2.0,3.5,1.0,versicolor
5.9,3.0,4.2,1.5,versicolor
6.0,2.2,4.0,1.0,versicolor
6.1,2.9,4.7,1.4,versicolor
5.6,2.9,3.6,1.3,versicolor
6.7,3.1,4.4,1.4,versicolor
5.6,3.0,4.5,1.5,versicolor
5.8,2.7,4.1,1.0,versicolor
6.2,2.2,4.5,1.5,versicolor
5.6,2.5,3.9,1.1,versicolor
5.9,3.2,4.8,1.8,versicolor
6.1,2.8,4.0,1.3,versicolor
6.3,2.5,4.9,1.5,versicolor
6.1,2.8,4.7,1.2,versicolor
6.4,2.9,4.3,1.3,versicolor
6.6,3.0,4.4,1.4,versicolor
6.8,2.8,4.8,1.4,versicolor
6.7,3.0,5.0,1.7,versicolor
6.0,2.9,4.5,1.5,versicolor
5.7,2.6,3.5,1.0,versicolor
5.5,2.4,3.8,1.1,versicolor
5.5,2.4,3.7,1.0,versicolor
5.8,2.7,3.9,1.2,versicolor
6.0,2.7,5.1,1.6,versicolor
5.4,3.0,4.5,1.5,versicolor
6.0,3.4,4.5,1.6,versicolor
6.7,3.1,4.7,1.5,versicolor
6.3,2.3,4.4,1.3,versicolor
5.6,3.0,4.1,1.3,versicolor
5.5,2.5,4.0,1.3,versicolor
5.5,2.6,4.4,1.2,versicolor
6.1,3.0,4.6,1.4,versicolor
5.8,2.6,4.0,1.2,versicolor
5.0,2.3,3.3,1.0,versicolor
5.6,2.7,4.2,1.3,versicolor
5.7,3.0,4.2,1.2,versicolor
5.7,2.9,4.2,1.3,versicolor
6.2,2.9,4.3,1.3,versicolor
5.1,2.5,3.0,1.1,versicolor
5.7,2.8,4.1,1.3,versicolor
6.3,3.3,6.0,2.5,virginica
5.8,2.7,5.1,1.9,virginica
7.1,3.0,5.9,2.1,virginica
6.3,2.9,5.6,1.8,virginica
6.5,3.0,5.8,2.2,virginica
7.6,3.0,6.6,2.1,virginica
4.9,2.5,4.5,1.7,virginica
7.3,2.9,6.3,1.8,virginica
6.7,2.5,5.8,1.8,virginica
7.2,3.6,6.1,2.5,virginica
6.5,3.2,5.1,2.0,virginica
6.4,2.7,5.3,1.9,virginica
6.8,3.0,5.5,2.1,virginica
5.7,2.5,5.0,2.0,virginica
5.8,2.8,5.1,2.4,virginica
6.4,3.2,5.3,2.3,virginica
6.5,3.0,5.5,1.8,virginica
7.7,3.8,6.7,2.2,virginica
7.7,2.6,6.9,2.3,virginica
6.0,2.2,5.0,1.5,virginica
6.9,3.2,5.7,2.3,virginica
5.6,2.8,4.9,2.0,virginica
7.7,2.8,6.7,2.0,virginica
6.3,2.7,4.9,1.8,virginica
6.7,3.3,5.7,2.1,virginica
7.2,3.2,6.0,1.8,virginica
6.2,2.8,4.8,1.8,virginica
6.1,3.0,4.9,1.8,virginica
6.4,2.8,5.6,2.1,virginica
7.2,3.0,5.8,1.6,virginica
7.4,2.8,6.1,1.9,virginica
7.9,3.8,6.4,2.0,virginica
6.4,2.8,5.6,2.2,virginica
6.3,2.8,5.1,1.5,virginica
6.1,2.6,5.6,1.4,virginica
7.7,3.0,6.1,2.3,virginica
6.3,3.4,5.6,2.4,virginica
6.4,3.1,5.5,1.8,virginica
6.0,3.0,4.8,1.8,virginica
6.9,3.1,5.4,2.1,virginica
6.7,3.1,5.6,2.4,virginica
6.9,3.1,5.1,2.3,virginica
5.8,2.7,5.1,1.9,virginica
6.8,3.2,5.9,2.3,virginica
6.7,3.3,5.7,2.5,virginica
6.7,3.0,5.2,2.3,virginica
6.3,2.5,5.0,1.9,virginica
6.5,3.0,5.2,2.0,virginica
6.2,3.4,5.4,2.3,virginica
5.9,3.0,5.1,1.8,virginica""",
doc="""1. Title: Iris Plants Database
        Updated Sept 21 by C.Blake - Added discrepency information

2. Sources:
     (a) Creator: R.A. Fisher
     (b) Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
     (c) Date: July, 1988

3. Past Usage:
   - Publications: too many to mention!!!  Here are a few.
   1. Fisher,R.A. "The use of multiple measurements in taxonomic problems"
      Annual Eugenics, 7, Part II, 179-188 (1936); also in "Contributions
      to Mathematical Statistics" (John Wiley, NY, 1950).
   2. Duda,R.O., & Hart,P.E. (1973) Pattern Classification and Scene Analysis.
      (Q327.D83) John Wiley & Sons.  ISBN 0-471-22361-1.  See page 218.
   3. Dasarathy, B.V. (1980) "Nosing Around the Neighborhood: A New System
      Structure and Classification Rule for Recognition in Partially Exposed
      Environments".  IEEE Transactions on Pattern Analysis and Machine
      Intelligence, Vol. PAMI-2, No. 1, 67-71.
      -- Results:
         -- very low misclassification rates (0% for the setosa class)
   4. Gates, G.W. (1972) "The Reduced Nearest Neighbor Rule".  IEEE 
      Transactions on Information Theory, May 1972, 431-433.
      -- Results:
         -- very low misclassification rates again
   5. See also: 1988 MLC Proceedings, 54-64.  Cheeseman et al's AUTOCLASS II
      conceptual clustering system finds 3 classes in the data.

4. Relevant Information:
   --- This is perhaps the best known database to be found in the pattern
       recognition literature.  Fisher's paper is a classic in the field
       and is referenced frequently to this day.  (See Duda & Hart, for
       example.)  The data set contains 3 classes of 50 instances each,
       where each class refers to a type of iris plant.  One class is
       linearly separable from the other 2; the latter are NOT linearly
       separable from each other.
   --- Predicted attribute: class of iris plant.
   --- This is an exceedingly simple domain.
   --- This data differs from the data presented in Fishers article
        (identified by Steve Chadwick,  spchadwick@espeedaz.net )
        The 35th sample should be: 4.9,3.1,1.5,0.2,"Iris-setosa"
        where the error is in the fourth feature.
        The 38th sample: 4.9,3.6,1.4,0.1,"Iris-setosa"
        where the errors are in the second and third features.  

5. Number of Instances: 150 (50 in each of three classes)

6. Number of Attributes: 4 numeric, predictive attributes and the class

7. Attribute Information:
   1. sepal length in cm
   2. sepal width in cm
   3. petal length in cm
   4. petal width in cm
   5. class: 
      -- Iris Setosa
      -- Iris Versicolour
      -- Iris Virginica

8. Missing Attribute Values: None

Summary Statistics:
                 Min  Max   Mean    SD   Class Correlation
   sepal length: 4.3  7.9   5.84  0.83    0.7826   
    sepal width: 2.0  4.4   3.05  0.43   -0.4194
   petal length: 1.0  6.9   3.76  1.76    0.9490  (high!)
    petal width: 0.1  2.5   1.20  0.76    0.9565  (high!)

9. Class Distribution: 33.3% for each of 3 classes.""")

#______________________________________________________________________________
# Artificial, generated examples.

def Majority(k, n):
    """Return a DataSet with n k-bit examples of the majority problem.

    Each example is a list of k random bits followed by the target value:
    1 if more than half of the bits are 1, else 0.
    """
    examples = []
    for _ in range(n):
        bits = [random.choice([0, 1]) for _ in range(k)]
        # Compare 2*ones with k instead of ones with k/2 so the outcome does
        # not depend on whether `/` is integer or true division (the two
        # tests agree for every integer k).  The target is stored as an int
        # rather than a bool, matching the documented 1/0 encoding and the
        # int target produced by Parity.
        bits.append(int(2 * utils.sum(bits) > k))
        examples.append(bits)
    return DataSet(name="majority", examples=examples)

def Parity(k, n, name="parity"):
    """Return a DataSet called `name` holding n examples of k-bit parity.

    Each example is a list of k random bits followed by the target value:
    1 if an odd number of the bits are 1, else 0.
    """
    rows = []
    for _ in range(n):
        row = [random.choice([0, 1]) for _ in range(k)]
        # The sum of the bits modulo 2 is 1 exactly when the count of ones
        # is odd.
        parity_bit = utils.sum(row) % 2
        rows.append(row + [parity_bit])
    return DataSet(name=name, examples=rows)

def Xor(n):
    """Return a DataSet named "xor" with n examples of 2-input xor.

    Two-input xor is 2-bit parity, so this delegates to Parity.
    """
    return Parity(k=2, n=n, name="xor")

def ContinuousXor(n):
    """Return a DataSet with n examples of a continuous 2-input xor.

    Each example is [x, y, target], where x and y are drawn independently
    with random.uniform(0.0, 2.0) and target is 1 when the integer parts of
    x and y differ, else 0.
    """
    examples = []
    for _ in range(n):
        x = random.uniform(0.0, 2.0)
        y = random.uniform(0.0, 2.0)
        # int() truncates each input to its integer part; the comparison is
        # converted to an int so the target uses the same 0/1 encoding as
        # Xor and Parity instead of a bool.
        examples.append([x, y, int(int(x) != int(y))])
    return DataSet(name="continuous xor", examples=examples)

#______________________________________________________________________________

def compare(algorithms=[MajorityLearner, NaiveBayesLearner, 
                        NearestNeighborLearner, DecisionTreeLearner],
            datasets=[iris, orings, zoo, restaurant, SyntheticRestaurant(20),
                      Majority(7, 100), Parity(7, 100), Xor(100)],
            k=10, trials=1):
    """Cross-validate each learner on each data set and print a table.

    The table has one row per algorithm (labelled with its class name minus
    the 'Learner' suffix) and one column per data set (headed by the first
    seven characters of the data set's name).  Each cell is the value of
    cross_validation(algorithm(), dataset, k, trials).

    Note that the default lists are evaluated once, when this function is
    defined, so the default random data sets are generated at import time
    and reused by every call.
    """
    rows = []
    for learner in algorithms:
        label = learner.__name__.replace('Learner','')
        # A fresh learner instance is created for every data set.
        scores = [cross_validation(learner(), data, k, trials)
                  for data in datasets]
        rows.append([label] + scores)
    column_titles = [''] + [data.name[0:7] for data in datasets]
    utils.print_table(rows, header=column_titles, round=2)
    
