# Fix the RNG seed so every rerun draws the same values, which makes errors
# easier to trace.
# NOTE(review): sample()'s default algorithm changed in R 3.6.0, so the values
# recorded below may only reproduce on an older R — confirm.
set.seed(1234)
# `a` has to be a factor: the recorded str() output reports one and
# combine_factor() is applied to it further down. data.frame() stopped
# converting strings to factors by default in R 4.0.0, so ask for it explicitly.
df <- data.frame(
  a = LETTERS[sample(5, 15, replace = TRUE)],
  y = rnorm(15),
  stringsAsFactors = TRUE
)
df
## a y
## 1 A 0.98340378
## 2 D -0.62245679
## 3 D -0.73153600
## 4 D -0.51666972
## 5 E -1.75073344
## 6 D 0.88010416
## 7 A 1.37001035
## 8 B -1.68732684
## 9 D -0.62743621
## 10 C 0.01831663
## 11 D 0.70524346
## 12 C -0.64701901
## 13 B 0.86818087
## 14 E 0.37563561
## 15 B 0.31026217
# Inspect the structure of df; the recorded output shows `a` as a factor and
# `y` as numeric.
str(df)
## 'data.frame': 15 obs. of 2 variables:
## $ a: Factor w/ 5 levels "A","B","C","D",..: 1 4 4 4 5 4 1 2 4 3 ...
## $ y: num 0.983 -0.622 -0.732 -0.517 -1.751 ...
# Collapse the five levels of `a` into two groups: A and D become "A";
# B, C and E become "B" (see the recorded output).
b <- combine_factor(df$a, c(1, 2, 2, 1, 2))
# Keep A, B, C and D unchanged and fold E into "A".
c <- combine_factor(df$a, c(seq_len(4), 1))
# Append the new factors b and c as columns of the original data frame df.
df.new <- cbind(df, b, c)
df.new
## a y b c
## 1 A 0.98340378 A A
## 2 D -0.62245679 A D
## 3 D -0.73153600 A D
## 4 D -0.51666972 A D
## 5 E -1.75073344 B A
## 6 D 0.88010416 A D
## 7 A 1.37001035 A A
## 8 B -1.68732684 B B
## 9 D -0.62743621 A D
## 10 C 0.01831663 B C
## 11 D 0.70524346 A D
## 12 C -0.64701901 B C
## 13 B 0.86818087 B B
## 14 E 0.37563561 B A
## 15 B 0.31026217 B B
# Reordering factor levels by a summary statistic; see ?reorder for details.
# Start by inspecting the InsectSprays data set used in the example.
str(InsectSprays)
## 'data.frame': 72 obs. of 2 variables:
## $ count: num 10 7 20 14 14 12 10 23 17 20 ...
## $ spray: Factor w/ 6 levels "A","B","C","D",..: 1 1 1 1 1 1 1 1 1 1 ...
# Reorder the spray levels by the median insect count of each spray, then draw
# one box per spray using that ordering.
bymedian <- reorder(InsectSprays$spray, InsectSprays$count, FUN = median)
boxplot(
  count ~ bymedian,
  data = InsectSprays,
  main = "InsectSprays data",
  xlab = "Type of spray",
  ylab = "Insect count",
  col = "lightgray",
  varwidth = TRUE
)
#### Examples of converting numeric data into categories, taken from https://rcompanion.org/handbook/E_05.html
# Raw whitespace-delimited table: one Likert score per student, all for the
# same instructor.
Input <- "
Instructor Student Likert
Homer a 3
Homer b 4
Homer c 4
Homer d 4
Homer e 4
Homer f 5
Homer g 5
Homer h 5
Homer i 3
Homer j 2
Homer k 3
Homer l 4
Homer m 5
Homer n 5
Homer o 5
Homer p 4
Homer q 4
Homer r 3
Homer s 2
Homer t 5
Homer u 3
"
# Parse with `text =` so no textConnection is left open afterwards, and request
# factors explicitly: the recorded output shows Instructor and Student as
# factors, but read.table() stopped converting strings by default in R 4.0.0.
Data <- read.table(text = Input, header = TRUE, stringsAsFactors = TRUE)
### Check the data frame: show its first and last few rows.
# NOTE(review): headTail() is not base R — presumably psych::headTail(),
# loaded outside this excerpt; confirm.
headTail(Data)
## Instructor Student Likert
## 1 Homer a 3
## 2 Homer b 4
## 3 Homer c 4
## 4 Homer d 4
## ... <NA> <NA> ...
## 18 Homer r 3
## 19 Homer s 2
## 20 Homer t 5
## 21 Homer u 3
# Column types: the recorded output shows two factors and an integer Likert.
str(Data)
## 'data.frame': 21 obs. of 3 variables:
## $ Instructor: Factor w/ 1 level "Homer": 1 1 1 1 1 1 1 1 1 1 ...
## $ Student : Factor w/ 21 levels "a","b","c","d",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Likert : int 3 4 4 4 4 5 5 5 3 2 ...
# Per-column summary: level counts for the factors, quartiles and mean for
# Likert (see the recorded output).
summary(Data)
## Instructor Student Likert
## Homer:21 a : 1 Min. :2.000
## b : 1 1st Qu.:3.000
## c : 1 Median :4.000
## d : 1 Mean :3.905
## e : 1 3rd Qu.:5.000
## f : 1 Max. :5.000
## (Other):15
# Categorize data: bin the Likert scores into three labelled bands
# (1-2 = "Low", 3 = "Medium", 4-5 = "High"). Scores outside 1-5 are not
# assigned a label.
Data$Category[Data$Likert %in% c(1, 2)] <- "Low"
Data$Category[Data$Likert %in% 3] <- "Medium"
Data$Category[Data$Likert %in% c(4, 5)] <- "High"
Data
## Instructor Student Likert Category
## 1 Homer a 3 Medium
## 2 Homer b 4 High
## 3 Homer c 4 High
## 4 Homer d 4 High
## 5 Homer e 4 High
## 6 Homer f 5 High
## 7 Homer g 5 High
## 8 Homer h 5 High
## 9 Homer i 3 Medium
## 10 Homer j 2 Low
## 11 Homer k 3 Medium
## 12 Homer l 4 High
## 13 Homer m 5 High
## 14 Homer n 5 High
## 15 Homer o 5 High
## 16 Homer p 4 High
## 17 Homer q 4 High
## 18 Homer r 3 Medium
## 19 Homer s 2 Low
## 20 Homer t 5 High
## 21 Homer u 3 Medium
# Turn Category into a factor with levels in Low / Medium / High order so that
# tables list the categories in that order rather than alphabetically.
Data[["Category"]] <- factor(
  Data[["Category"]],
  levels = c("Low", "Medium", "High")
)
# Count the students in each category, per instructor.
XT <- xtabs(formula = ~ Category + Instructor, data = Data)
XT
## Instructor
## Category Homer
## Low 2
## Medium 5
## High 14
# List the students that fall into each category.
with(Data, Student[Category == "Low"])
## [1] j s
## Levels: a b c d e f g h i j k l m n o p q r s t u
with(Data, Student[Category == "Medium"])
## [1] a i k r u
## Levels: a b c d e f g h i j k l m n o p q r s t u
with(Data, Student[Category == "High"])
## [1] b c d e f g h l m n o p q t
## Levels: a b c d e f g h i j k l m n o p q r s t u
# Categorize data by terciles: compute the cut points of the Likert scores
# (minimum, 1/3 and 2/3 quantiles, maximum).
Percentile_00 <- min(Data$Likert)
Percentile_33 <- quantile(Data$Likert, probs = 0.33333)
Percentile_67 <- quantile(Data$Likert, probs = 0.66667)
Percentile_100 <- max(Data$Likert)
# Stack the four cut points into a one-column matrix RB for display. RB is a
# standalone object; it is not added to Data.
RB <- rbind(Percentile_00, Percentile_33, Percentile_67, Percentile_100)
colnames(RB) <- "Value"
RB
## Value
## Percentile_00 2.0000
## Percentile_33 3.6666
## Percentile_67 4.3334
## Percentile_100 5.0000
# Assign each student to a third of the score distribution. Each interval
# includes its lower cut point and excludes its upper one, except the top
# interval, which also includes the maximum.
Data$Group[
  with(Data, Likert >= Percentile_00 & Likert < Percentile_33)
] <- "Lower_third"
Data$Group[
  with(Data, Likert >= Percentile_33 & Likert < Percentile_67)
] <- "Middle_third"
Data$Group[
  with(Data, Likert >= Percentile_67 & Likert <= Percentile_100)
] <- "Upper_third"
Data
## Instructor Student Likert Category Group
## 1 Homer a 3 Medium Lower_third
## 2 Homer b 4 High Middle_third
## 3 Homer c 4 High Middle_third
## 4 Homer d 4 High Middle_third
## 5 Homer e 4 High Middle_third
## 6 Homer f 5 High Upper_third
## 7 Homer g 5 High Upper_third
## 8 Homer h 5 High Upper_third
## 9 Homer i 3 Medium Lower_third
## 10 Homer j 2 Low Lower_third
## 11 Homer k 3 Medium Lower_third
## 12 Homer l 4 High Middle_third
## 13 Homer m 5 High Upper_third
## 14 Homer n 5 High Upper_third
## 15 Homer o 5 High Upper_third
## 16 Homer p 4 High Middle_third
## 17 Homer q 4 High Middle_third
## 18 Homer r 3 Medium Lower_third
## 19 Homer s 2 Low Lower_third
## 20 Homer t 5 High Upper_third
## 21 Homer u 3 Medium Lower_third
# Turn Group into a factor with levels in lower / middle / upper order so that
# tables list the groups in that order.
Data[["Group"]] <- factor(
  Data[["Group"]],
  levels = c("Lower_third", "Middle_third", "Upper_third")
)
# Count the students in each group, per instructor.
XT <- xtabs(formula = ~ Group + Instructor, data = Data)
XT
## Instructor
## Group Homer
## Lower_third 7
## Middle_third 7
## Upper_third 7