#===================Pre-amble and explanation=============================================
# This script investigates ways to solve the issue we ended with on Monday:
# How do we create a new categorical variable to break the income variable into
# groups - low, mid and high?
# The rule is low is less than 25k
# mid is greater than 25k and less than 35k
# high is 35k and greater
# The way that many R programmers would approach this is demonstrated first, then
# I show a more explicit way using the "ifelse" construct. The issue is
# creating the right intervals
# The results of these are compared by displaying the results side by side with income.
# You should pay attention to *case number 75* (meddf[75]) to see what the difference is.
# I also mention an alternative to cut which is included in a package Hmisc which
# you could investigate. It has more flexibility than cut but not by much...
# In all, in a case where we have one condition that is > or < and another that is
# >= or <=, neither cut is very easy to use or flexible and using _ifelse_
# is perhaps more straightforward.
# ====================beginning of examples proper========================
# read in the data from the remove csv vile
# this script produces some output for you to examine
# if you run it all in one go you will want to scroll back
# through the output and read it in conjunction with the script
# to understand it
meddf <- read.csv("http://www.ucl.ac.uk/~ccaajim/medtrial.csv")
# first, I'm going to try with cut()
# cut() excludes the lower value in an interval and includes the higher.
# if you use breaks c(0,10,20,30) then the intevals are:
# >0 to <=10, >10 to <=20, >20 to <=30
# try the next code line first to help see that.
# it takes the series of integers 1 to 31 and cuts them at
# the breakpoints specified and displays the results as level 1, 2 or three.
# why are 0 and 31 and NA?
x<-as.numeric(cut(1:31,c(0,10,20,30)))
# the **as.numeric** in the line above just forces r to print simple level numbers rather
# than *intervals* - it's cosmetic
# here is an attempt to cut income into a variable incg.1
# the dig.lab option sets to display numers up to ten digits
# rather than scientific notation
meddf$incg.1 <- cut(meddf$income,c(0,25000,35000,100000),dig.lab = 10)
# the next line displays the result of the cut against income for checking
print(cbind(meddf$income, meddf$incg.1))
## [,1] [,2]
## [1,] 46000 3
## [2,] 58000 3
## [3,] 47000 3
## [4,] 55000 3
## [5,] 28000 2
## [6,] 28000 2
## [7,] 61000 3
## [8,] 39000 3
## [9,] NA NA
## [10,] 12000 1
## [11,] 39000 3
## [12,] 38000 3
## [13,] 30000 2
## [14,] 48000 3
## [15,] 36000 3
## [16,] 47000 3
## [17,] 43000 3
## [18,] 54000 3
## [19,] 49000 3
## [20,] 36000 3
## [21,] 58000 3
## [22,] 46000 3
## [23,] 47000 3
## [24,] 44000 3
## [25,] 45000 3
## [26,] 26000 2
## [27,] 44000 3
## [28,] 52000 3
## [29,] 48000 3
## [30,] 61000 3
## [31,] 38000 3
## [32,] 17000 1
## [33,] 47000 3
## [34,] 54000 3
## [35,] 54000 3
## [36,] NA NA
## [37,] 26000 2
## [38,] 57000 3
## [39,] 42000 3
## [40,] 33000 2
## [41,] 29000 2
## [42,] 46000 3
## [43,] 50000 3
## [44,] 49000 3
## [45,] 40000 3
## [46,] 46000 3
## [47,] 15000 1
## [48,] 61000 3
## [49,] 32000 2
## [50,] 47000 3
## [51,] 45000 3
## [52,] 28000 2
## [53,] 47000 3
## [54,] 55000 3
## [55,] 66000 3
## [56,] 44000 3
## [57,] 34000 2
## [58,] 53000 3
## [59,] 48000 3
## [60,] 34000 2
## [61,] 40000 3
## [62,] 49000 3
## [63,] 51000 3
## [64,] 45000 3
## [65,] 39000 3
## [66,] 44000 3
## [67,] 49000 3
## [68,] 49000 3
## [69,] 46000 3
## [70,] 53000 3
## [71,] 33000 2
## [72,] 51000 3
## [73,] 55000 3
## [74,] 23000 1
## [75,] 35000 2
## [76,] 35000 2
## [77,] 23000 1
## [78,] 52000 3
## [79,] 65000 3
## [80,] 50000 3
## [81,] 40000 3
## [82,] 42000 3
## [83,] 40000 3
## [84,] 63000 3
## [85,] 37000 3
## [86,] 47000 3
# **Writing the rule by hand**
#
# this version uses a *control* structure command called *ifelse
# the rule is like this:
#
# ifelse(variablevalue meets condition, outcome,
# ifelse(variablevalue meets secondcondition, secondoutcome,
# defaultoutcome))
# here is the code corresponding to the last comment:
# to run it you must highlight all three lines
meddf$incg.2 <- ifelse (meddf$income >= 35000, 3,
ifelse (meddf$income < 25000,1,
2))
# And now these lines just display the different results
# lined up so that you can check for differences
print(cbind(meddf$income, meddf$incg.2))
## [,1] [,2]
## [1,] 46000 3
## [2,] 58000 3
## [3,] 47000 3
## [4,] 55000 3
## [5,] 28000 2
## [6,] 28000 2
## [7,] 61000 3
## [8,] 39000 3
## [9,] NA NA
## [10,] 12000 1
## [11,] 39000 3
## [12,] 38000 3
## [13,] 30000 2
## [14,] 48000 3
## [15,] 36000 3
## [16,] 47000 3
## [17,] 43000 3
## [18,] 54000 3
## [19,] 49000 3
## [20,] 36000 3
## [21,] 58000 3
## [22,] 46000 3
## [23,] 47000 3
## [24,] 44000 3
## [25,] 45000 3
## [26,] 26000 2
## [27,] 44000 3
## [28,] 52000 3
## [29,] 48000 3
## [30,] 61000 3
## [31,] 38000 3
## [32,] 17000 1
## [33,] 47000 3
## [34,] 54000 3
## [35,] 54000 3
## [36,] NA NA
## [37,] 26000 2
## [38,] 57000 3
## [39,] 42000 3
## [40,] 33000 2
## [41,] 29000 2
## [42,] 46000 3
## [43,] 50000 3
## [44,] 49000 3
## [45,] 40000 3
## [46,] 46000 3
## [47,] 15000 1
## [48,] 61000 3
## [49,] 32000 2
## [50,] 47000 3
## [51,] 45000 3
## [52,] 28000 2
## [53,] 47000 3
## [54,] 55000 3
## [55,] 66000 3
## [56,] 44000 3
## [57,] 34000 2
## [58,] 53000 3
## [59,] 48000 3
## [60,] 34000 2
## [61,] 40000 3
## [62,] 49000 3
## [63,] 51000 3
## [64,] 45000 3
## [65,] 39000 3
## [66,] 44000 3
## [67,] 49000 3
## [68,] 49000 3
## [69,] 46000 3
## [70,] 53000 3
## [71,] 33000 2
## [72,] 51000 3
## [73,] 55000 3
## [74,] 23000 1
## [75,] 35000 3
## [76,] 35000 3
## [77,] 23000 1
## [78,] 52000 3
## [79,] 65000 3
## [80,] 50000 3
## [81,] 40000 3
## [82,] 42000 3
## [83,] 40000 3
## [84,] 63000 3
## [85,] 37000 3
## [86,] 47000 3
print(cbind(meddf$incg.1,meddf$incg.2))
## [,1] [,2]
## [1,] 3 3
## [2,] 3 3
## [3,] 3 3
## [4,] 3 3
## [5,] 2 2
## [6,] 2 2
## [7,] 3 3
## [8,] 3 3
## [9,] NA NA
## [10,] 1 1
## [11,] 3 3
## [12,] 3 3
## [13,] 2 2
## [14,] 3 3
## [15,] 3 3
## [16,] 3 3
## [17,] 3 3
## [18,] 3 3
## [19,] 3 3
## [20,] 3 3
## [21,] 3 3
## [22,] 3 3
## [23,] 3 3
## [24,] 3 3
## [25,] 3 3
## [26,] 2 2
## [27,] 3 3
## [28,] 3 3
## [29,] 3 3
## [30,] 3 3
## [31,] 3 3
## [32,] 1 1
## [33,] 3 3
## [34,] 3 3
## [35,] 3 3
## [36,] NA NA
## [37,] 2 2
## [38,] 3 3
## [39,] 3 3
## [40,] 2 2
## [41,] 2 2
## [42,] 3 3
## [43,] 3 3
## [44,] 3 3
## [45,] 3 3
## [46,] 3 3
## [47,] 1 1
## [48,] 3 3
## [49,] 2 2
## [50,] 3 3
## [51,] 3 3
## [52,] 2 2
## [53,] 3 3
## [54,] 3 3
## [55,] 3 3
## [56,] 3 3
## [57,] 2 2
## [58,] 3 3
## [59,] 3 3
## [60,] 2 2
## [61,] 3 3
## [62,] 3 3
## [63,] 3 3
## [64,] 3 3
## [65,] 3 3
## [66,] 3 3
## [67,] 3 3
## [68,] 3 3
## [69,] 3 3
## [70,] 3 3
## [71,] 2 2
## [72,] 3 3
## [73,] 3 3
## [74,] 1 1
## [75,] 2 3
## [76,] 2 3
## [77,] 1 1
## [78,] 3 3
## [79,] 3 3
## [80,] 3 3
## [81,] 3 3
## [82,] 3 3
## [83,] 3 3
## [84,] 3 3
## [85,] 3 3
## [86,] 3 3
print(as.numeric(meddf$incg.1)!=meddf$incg.2)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE NA FALSE FALSE
## [12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [23] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [34] FALSE FALSE NA FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [45] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [56] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [67] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE
## [78] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# Did these code lines do what you expected?
# cut() is rather inflexible with its treatment of the limits of intervals.
#**Optional research exercise**
# Read help on cut - is there a solution? Read about cut2 from package hmisc: would this help?
# install the package Hmisc by uncommenting the next two lines
# install.packages("Hmisc")
# library(Hmisc)
# run this next line in the console to see how the function cut2 works
# help(cut2)
# would the cut2 function from the package Hmisc help?
#**end of optional research**
#================conclusion=============================================
# of the three, I think the ifelse "handcrafted" version is easiest to grasp in this case.
# the last task is to make a *factor* out of the incg.2 variable
incg.2<-factor(meddf$incg.2,levels=c(1,2,3),labels=c("low","med","high"))