#===================Pre-amble and explanation=============================================

# This script investigates ways to solve the issue we ended with on Monday:
# How do we create a new categorical variable to break the income variable into 
# groups - low, mid and high?

# The rule is low is less than 25k
#             mid is greater than 25k and less than 35k
#             high is 35k and greater

# The way that many R programmers would approach this is demonstrated first, then
# I show a more explicit way using the "ifelse" construct.  The issue is
# creating the right intervals

# The results of these are compared by displaying the results side by side with income.

# You should pay attention to *case number 75* (meddf[75]) to see what the difference is.

# I also mention an alternative to cut which is included in a package Hmisc which
# you could investigate.  It has more flexibility than cut but not by much...

# In all, in a case where we have one condition that is > or < and another that is
# >= or <=, neither cut is very easy to use or flexible and using _ifelse_
# is perhaps more straightforward.


# ====================beginning of examples proper========================
# read in the data from the remove csv vile
# this script produces some output for you to examine
# if you run it all in one go you will want to scroll back
# through the output and read it in conjunction with the script
# to understand it

meddf <- read.csv("http://www.ucl.ac.uk/~ccaajim/medtrial.csv")


# first, I'm going to try with cut()
# cut() excludes the lower value in an interval and includes the higher.
# if you use breaks c(0,10,20,30) then the intevals are:

# >0 to <=10, >10 to <=20, >20 to <=30

# try the next code line first to help see that.
# it takes the series of integers 1 to 31 and cuts them at
# the breakpoints specified and displays the results as level 1, 2 or three.
# why are 0 and 31 and NA?

x<-as.numeric(cut(1:31,c(0,10,20,30)))

# the **as.numeric** in the line above just forces r to print simple level numbers rather
# than *intervals* - it's cosmetic

# here is an attempt to cut income into a variable incg.1
# the dig.lab option sets to display numers up to ten digits
# rather than scientific notation

meddf$incg.1 <- cut(meddf$income,c(0,25000,35000,100000),dig.lab = 10)

# the next line displays the result of the cut against income for checking

print(cbind(meddf$income, meddf$incg.1))
##        [,1] [,2]
##  [1,] 46000    3
##  [2,] 58000    3
##  [3,] 47000    3
##  [4,] 55000    3
##  [5,] 28000    2
##  [6,] 28000    2
##  [7,] 61000    3
##  [8,] 39000    3
##  [9,]    NA   NA
## [10,] 12000    1
## [11,] 39000    3
## [12,] 38000    3
## [13,] 30000    2
## [14,] 48000    3
## [15,] 36000    3
## [16,] 47000    3
## [17,] 43000    3
## [18,] 54000    3
## [19,] 49000    3
## [20,] 36000    3
## [21,] 58000    3
## [22,] 46000    3
## [23,] 47000    3
## [24,] 44000    3
## [25,] 45000    3
## [26,] 26000    2
## [27,] 44000    3
## [28,] 52000    3
## [29,] 48000    3
## [30,] 61000    3
## [31,] 38000    3
## [32,] 17000    1
## [33,] 47000    3
## [34,] 54000    3
## [35,] 54000    3
## [36,]    NA   NA
## [37,] 26000    2
## [38,] 57000    3
## [39,] 42000    3
## [40,] 33000    2
## [41,] 29000    2
## [42,] 46000    3
## [43,] 50000    3
## [44,] 49000    3
## [45,] 40000    3
## [46,] 46000    3
## [47,] 15000    1
## [48,] 61000    3
## [49,] 32000    2
## [50,] 47000    3
## [51,] 45000    3
## [52,] 28000    2
## [53,] 47000    3
## [54,] 55000    3
## [55,] 66000    3
## [56,] 44000    3
## [57,] 34000    2
## [58,] 53000    3
## [59,] 48000    3
## [60,] 34000    2
## [61,] 40000    3
## [62,] 49000    3
## [63,] 51000    3
## [64,] 45000    3
## [65,] 39000    3
## [66,] 44000    3
## [67,] 49000    3
## [68,] 49000    3
## [69,] 46000    3
## [70,] 53000    3
## [71,] 33000    2
## [72,] 51000    3
## [73,] 55000    3
## [74,] 23000    1
## [75,] 35000    2
## [76,] 35000    2
## [77,] 23000    1
## [78,] 52000    3
## [79,] 65000    3
## [80,] 50000    3
## [81,] 40000    3
## [82,] 42000    3
## [83,] 40000    3
## [84,] 63000    3
## [85,] 37000    3
## [86,] 47000    3
# **Writing the rule by hand**
#
# this version uses a *control* structure command called *ifelse
# the rule is like this:
#
# ifelse(variablevalue meets condition, outcome, 
#   ifelse(variablevalue meets secondcondition, secondoutcome,
#     defaultoutcome))

# here is the code corresponding to the last comment:
# to run it you must highlight all three lines

meddf$incg.2 <- ifelse (meddf$income >= 35000, 3,
                  ifelse (meddf$income < 25000,1,
                          2))

# And now these lines just display the different results
# lined up so that  you can check for differences

print(cbind(meddf$income, meddf$incg.2))
##        [,1] [,2]
##  [1,] 46000    3
##  [2,] 58000    3
##  [3,] 47000    3
##  [4,] 55000    3
##  [5,] 28000    2
##  [6,] 28000    2
##  [7,] 61000    3
##  [8,] 39000    3
##  [9,]    NA   NA
## [10,] 12000    1
## [11,] 39000    3
## [12,] 38000    3
## [13,] 30000    2
## [14,] 48000    3
## [15,] 36000    3
## [16,] 47000    3
## [17,] 43000    3
## [18,] 54000    3
## [19,] 49000    3
## [20,] 36000    3
## [21,] 58000    3
## [22,] 46000    3
## [23,] 47000    3
## [24,] 44000    3
## [25,] 45000    3
## [26,] 26000    2
## [27,] 44000    3
## [28,] 52000    3
## [29,] 48000    3
## [30,] 61000    3
## [31,] 38000    3
## [32,] 17000    1
## [33,] 47000    3
## [34,] 54000    3
## [35,] 54000    3
## [36,]    NA   NA
## [37,] 26000    2
## [38,] 57000    3
## [39,] 42000    3
## [40,] 33000    2
## [41,] 29000    2
## [42,] 46000    3
## [43,] 50000    3
## [44,] 49000    3
## [45,] 40000    3
## [46,] 46000    3
## [47,] 15000    1
## [48,] 61000    3
## [49,] 32000    2
## [50,] 47000    3
## [51,] 45000    3
## [52,] 28000    2
## [53,] 47000    3
## [54,] 55000    3
## [55,] 66000    3
## [56,] 44000    3
## [57,] 34000    2
## [58,] 53000    3
## [59,] 48000    3
## [60,] 34000    2
## [61,] 40000    3
## [62,] 49000    3
## [63,] 51000    3
## [64,] 45000    3
## [65,] 39000    3
## [66,] 44000    3
## [67,] 49000    3
## [68,] 49000    3
## [69,] 46000    3
## [70,] 53000    3
## [71,] 33000    2
## [72,] 51000    3
## [73,] 55000    3
## [74,] 23000    1
## [75,] 35000    3
## [76,] 35000    3
## [77,] 23000    1
## [78,] 52000    3
## [79,] 65000    3
## [80,] 50000    3
## [81,] 40000    3
## [82,] 42000    3
## [83,] 40000    3
## [84,] 63000    3
## [85,] 37000    3
## [86,] 47000    3
print(cbind(meddf$incg.1,meddf$incg.2))
##       [,1] [,2]
##  [1,]    3    3
##  [2,]    3    3
##  [3,]    3    3
##  [4,]    3    3
##  [5,]    2    2
##  [6,]    2    2
##  [7,]    3    3
##  [8,]    3    3
##  [9,]   NA   NA
## [10,]    1    1
## [11,]    3    3
## [12,]    3    3
## [13,]    2    2
## [14,]    3    3
## [15,]    3    3
## [16,]    3    3
## [17,]    3    3
## [18,]    3    3
## [19,]    3    3
## [20,]    3    3
## [21,]    3    3
## [22,]    3    3
## [23,]    3    3
## [24,]    3    3
## [25,]    3    3
## [26,]    2    2
## [27,]    3    3
## [28,]    3    3
## [29,]    3    3
## [30,]    3    3
## [31,]    3    3
## [32,]    1    1
## [33,]    3    3
## [34,]    3    3
## [35,]    3    3
## [36,]   NA   NA
## [37,]    2    2
## [38,]    3    3
## [39,]    3    3
## [40,]    2    2
## [41,]    2    2
## [42,]    3    3
## [43,]    3    3
## [44,]    3    3
## [45,]    3    3
## [46,]    3    3
## [47,]    1    1
## [48,]    3    3
## [49,]    2    2
## [50,]    3    3
## [51,]    3    3
## [52,]    2    2
## [53,]    3    3
## [54,]    3    3
## [55,]    3    3
## [56,]    3    3
## [57,]    2    2
## [58,]    3    3
## [59,]    3    3
## [60,]    2    2
## [61,]    3    3
## [62,]    3    3
## [63,]    3    3
## [64,]    3    3
## [65,]    3    3
## [66,]    3    3
## [67,]    3    3
## [68,]    3    3
## [69,]    3    3
## [70,]    3    3
## [71,]    2    2
## [72,]    3    3
## [73,]    3    3
## [74,]    1    1
## [75,]    2    3
## [76,]    2    3
## [77,]    1    1
## [78,]    3    3
## [79,]    3    3
## [80,]    3    3
## [81,]    3    3
## [82,]    3    3
## [83,]    3    3
## [84,]    3    3
## [85,]    3    3
## [86,]    3    3
print(as.numeric(meddf$incg.1)!=meddf$incg.2)
##  [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE    NA FALSE FALSE
## [12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [23] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [34] FALSE FALSE    NA FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [45] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [56] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [67] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE
## [78] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# Did these code lines do what you expected?

# cut() is rather inflexible with its treatment of the limits of intervals.

#**Optional research exercise**
# Read help on cut - is there a solution?  Read about cut2 from package hmisc: would this help?

# install the package Hmisc by uncommenting the next two lines

# install.packages("Hmisc")
# library(Hmisc)
# run this next line in the console to see how the function cut2 works
# help(cut2)

# would the cut2 function from the package Hmisc help?
#**end of optional research**

#================conclusion=============================================
# of the three, I think the ifelse "handcrafted" version is easiest to grasp in this case.
# the last task is to make a *factor* out of the incg.2 variable

incg.2<-factor(meddf$incg.2,levels=c(1,2,3),labels=c("low","med","high"))