In this dataset, the columns are gender, department, admitted and whether or not they are rejected. I am going to try to clean up this dataset.
rejected<-read.csv("https://raw.githubusercontent.com/Sangeetha-007/R-Practice/master/607/Projects/Project%202/Admitted_Rejected%20-%20Sheet1.csv")
rejected
## Gender Dept Admitted Rejected
## 1 Male A 512 313
## 2 Female A 89 19
## 3 Male B 353 207
## 4 Female B 17 8
## 5 Male C 120 205
## 6 Female C 202 391
## 7 Male D 138 279
## 8 Female D 131 244
## 9 Male E 53 138
## 10 Female E 94 299
## 11 Male F 22 351
## 12 Female F 24 317
sapply(rejected, class)
## Gender Dept Admitted Rejected
## "character" "character" "integer" "integer"
new_df <- rejected |> pivot_longer(cols=c("Admitted", "Rejected"), names_to="Status",
values_to="Count")
new_df
## # A tibble: 24 × 4
## Gender Dept Status Count
## <chr> <chr> <chr> <int>
## 1 Male A Admitted 512
## 2 Male A Rejected 313
## 3 Female A Admitted 89
## 4 Female A Rejected 19
## 5 Male B Admitted 353
## 6 Male B Rejected 207
## 7 Female B Admitted 17
## 8 Female B Rejected 8
## 9 Male C Admitted 120
## 10 Male C Rejected 205
## # … with 14 more rows
dfForGraph<- new_df %>% group_by(Gender, Status) %>% summarize(Sum = sum(Count))
dfForGraph
## # A tibble: 4 × 3
## # Groups: Gender [2]
## Gender Status Sum
## <chr> <chr> <int>
## 1 Female Admitted 557
## 2 Female Rejected 1278
## 3 Male Admitted 1198
## 4 Male Rejected 1493
ggplot(dfForGraph, aes( y=`Sum`, x=Gender, fill=Gender)) +
geom_bar(position="dodge", stat="identity")+facet_wrap(~Status)
Source: https://r-graph-gallery.com/48-grouped-barplot-with-ggplot2
This dataset presents a bunch of test scores for students, just by a quick look at the dataset, we can tell that the data is "dirty".
grades<-read.csv("https://gist.githubusercontent.com/Kimmirikwa/b69d0ea134820ea52f8481991ffae93e/raw/4db7b1698035ee29885d10e1a59bd902716ae168/student_results.csv")
grades
## id name phone sex.and.age test.number term.1 term.2 term.3
## 1 1 Mike 134 m_12 test 1 76 84 87
## 2 2 Linda 270 f_13 test 1 88 90 73
## 3 3 Sam 210 m_11 test 1 78 74 80
## 4 4 Esther 617 f_12 test 1 68 75 74
## 5 5 Mary 114 f_14 test 1 65 67 64
## 6 1 Mike 134 m_12 test 2 85 80 90
## 7 2 Linda 270 f_13 test 2 87 82 94
## 8 3 Sam 210 m_11 test 2 80 87 80
## 9 4 Esther 617 f_12 test 2 70 75 78
## 10 5 Mary 114 f_14 test 2 68 70 63
class(grades)
## [1] "data.frame"
grades[c('Gender', 'Age')] <- str_split_fixed(grades$sex.and.age, '_', 2)
grades
## id name phone sex.and.age test.number term.1 term.2 term.3 Gender Age
## 1 1 Mike 134 m_12 test 1 76 84 87 m 12
## 2 2 Linda 270 f_13 test 1 88 90 73 f 13
## 3 3 Sam 210 m_11 test 1 78 74 80 m 11
## 4 4 Esther 617 f_12 test 1 68 75 74 f 12
## 5 5 Mary 114 f_14 test 1 65 67 64 f 14
## 6 1 Mike 134 m_12 test 2 85 80 90 m 12
## 7 2 Linda 270 f_13 test 2 87 82 94 f 13
## 8 3 Sam 210 m_11 test 2 80 87 80 m 11
## 9 4 Esther 617 f_12 test 2 70 75 78 f 12
## 10 5 Mary 114 f_14 test 2 68 70 63 f 14
grades
## id name phone sex.and.age test.number term.1 term.2 term.3 Gender Age
## 1 1 Mike 134 m_12 test 1 76 84 87 m 12
## 2 2 Linda 270 f_13 test 1 88 90 73 f 13
## 3 3 Sam 210 m_11 test 1 78 74 80 m 11
## 4 4 Esther 617 f_12 test 1 68 75 74 f 12
## 5 5 Mary 114 f_14 test 1 65 67 64 f 14
## 6 1 Mike 134 m_12 test 2 85 80 90 m 12
## 7 2 Linda 270 f_13 test 2 87 82 94 f 13
## 8 3 Sam 210 m_11 test 2 80 87 80 m 11
## 9 4 Esther 617 f_12 test 2 70 75 78 f 12
## 10 5 Mary 114 f_14 test 2 68 70 63 f 14
grades <- subset(grades, select = -c(4) )
grades
## id name phone test.number term.1 term.2 term.3 Gender Age
## 1 1 Mike 134 test 1 76 84 87 m 12
## 2 2 Linda 270 test 1 88 90 73 f 13
## 3 3 Sam 210 test 1 78 74 80 m 11
## 4 4 Esther 617 test 1 68 75 74 f 12
## 5 5 Mary 114 test 1 65 67 64 f 14
## 6 1 Mike 134 test 2 85 80 90 m 12
## 7 2 Linda 270 test 2 87 82 94 f 13
## 8 3 Sam 210 test 2 80 87 80 m 11
## 9 4 Esther 617 test 2 70 75 78 f 12
## 10 5 Mary 114 test 2 68 70 63 f 14
grades<- grades %>%
gather("Term", "Grades", 5:7)
grades
## id name phone test.number Gender Age Term Grades
## 1 1 Mike 134 test 1 m 12 term.1 76
## 2 2 Linda 270 test 1 f 13 term.1 88
## 3 3 Sam 210 test 1 m 11 term.1 78
## 4 4 Esther 617 test 1 f 12 term.1 68
## 5 5 Mary 114 test 1 f 14 term.1 65
## 6 1 Mike 134 test 2 m 12 term.1 85
## 7 2 Linda 270 test 2 f 13 term.1 87
## 8 3 Sam 210 test 2 m 11 term.1 80
## 9 4 Esther 617 test 2 f 12 term.1 70
## 10 5 Mary 114 test 2 f 14 term.1 68
## 11 1 Mike 134 test 1 m 12 term.2 84
## 12 2 Linda 270 test 1 f 13 term.2 90
## 13 3 Sam 210 test 1 m 11 term.2 74
## 14 4 Esther 617 test 1 f 12 term.2 75
## 15 5 Mary 114 test 1 f 14 term.2 67
## 16 1 Mike 134 test 2 m 12 term.2 80
## 17 2 Linda 270 test 2 f 13 term.2 82
## 18 3 Sam 210 test 2 m 11 term.2 87
## 19 4 Esther 617 test 2 f 12 term.2 75
## 20 5 Mary 114 test 2 f 14 term.2 70
## 21 1 Mike 134 test 1 m 12 term.3 87
## 22 2 Linda 270 test 1 f 13 term.3 73
## 23 3 Sam 210 test 1 m 11 term.3 80
## 24 4 Esther 617 test 1 f 12 term.3 74
## 25 5 Mary 114 test 1 f 14 term.3 64
## 26 1 Mike 134 test 2 m 12 term.3 90
## 27 2 Linda 270 test 2 f 13 term.3 94
## 28 3 Sam 210 test 2 m 11 term.3 80
## 29 4 Esther 617 test 2 f 12 term.3 78
## 30 5 Mary 114 test 2 f 14 term.3 63
new <- cbind(read.table(text = as.character(grades$Term), sep=".",
header = FALSE, col.names = c("Col1", "Col2")))
new
## Col1 Col2
## 1 term 1
## 2 term 1
## 3 term 1
## 4 term 1
## 5 term 1
## 6 term 1
## 7 term 1
## 8 term 1
## 9 term 1
## 10 term 1
## 11 term 2
## 12 term 2
## 13 term 2
## 14 term 2
## 15 term 2
## 16 term 2
## 17 term 2
## 18 term 2
## 19 term 2
## 20 term 2
## 21 term 3
## 22 term 3
## 23 term 3
## 24 term 3
## 25 term 3
## 26 term 3
## 27 term 3
## 28 term 3
## 29 term 3
## 30 term 3
class(new)
## [1] "data.frame"
grades_updated <- cbind(grades, new)
grades_updated
## id name phone test.number Gender Age Term Grades Col1 Col2
## 1 1 Mike 134 test 1 m 12 term.1 76 term 1
## 2 2 Linda 270 test 1 f 13 term.1 88 term 1
## 3 3 Sam 210 test 1 m 11 term.1 78 term 1
## 4 4 Esther 617 test 1 f 12 term.1 68 term 1
## 5 5 Mary 114 test 1 f 14 term.1 65 term 1
## 6 1 Mike 134 test 2 m 12 term.1 85 term 1
## 7 2 Linda 270 test 2 f 13 term.1 87 term 1
## 8 3 Sam 210 test 2 m 11 term.1 80 term 1
## 9 4 Esther 617 test 2 f 12 term.1 70 term 1
## 10 5 Mary 114 test 2 f 14 term.1 68 term 1
## 11 1 Mike 134 test 1 m 12 term.2 84 term 2
## 12 2 Linda 270 test 1 f 13 term.2 90 term 2
## 13 3 Sam 210 test 1 m 11 term.2 74 term 2
## 14 4 Esther 617 test 1 f 12 term.2 75 term 2
## 15 5 Mary 114 test 1 f 14 term.2 67 term 2
## 16 1 Mike 134 test 2 m 12 term.2 80 term 2
## 17 2 Linda 270 test 2 f 13 term.2 82 term 2
## 18 3 Sam 210 test 2 m 11 term.2 87 term 2
## 19 4 Esther 617 test 2 f 12 term.2 75 term 2
## 20 5 Mary 114 test 2 f 14 term.2 70 term 2
## 21 1 Mike 134 test 1 m 12 term.3 87 term 3
## 22 2 Linda 270 test 1 f 13 term.3 73 term 3
## 23 3 Sam 210 test 1 m 11 term.3 80 term 3
## 24 4 Esther 617 test 1 f 12 term.3 74 term 3
## 25 5 Mary 114 test 1 f 14 term.3 64 term 3
## 26 1 Mike 134 test 2 m 12 term.3 90 term 3
## 27 2 Linda 270 test 2 f 13 term.3 94 term 3
## 28 3 Sam 210 test 2 m 11 term.3 80 term 3
## 29 4 Esther 617 test 2 f 12 term.3 78 term 3
## 30 5 Mary 114 test 2 f 14 term.3 63 term 3
colnames(grades_updated)
## [1] "id" "name" "phone" "test.number" "Gender"
## [6] "Age" "Term" "Grades" "Col1" "Col2"
grades_updated <- subset(grades_updated, select = -c(Col1, Term))
grades_updated
## id name phone test.number Gender Age Grades Col2
## 1 1 Mike 134 test 1 m 12 76 1
## 2 2 Linda 270 test 1 f 13 88 1
## 3 3 Sam 210 test 1 m 11 78 1
## 4 4 Esther 617 test 1 f 12 68 1
## 5 5 Mary 114 test 1 f 14 65 1
## 6 1 Mike 134 test 2 m 12 85 1
## 7 2 Linda 270 test 2 f 13 87 1
## 8 3 Sam 210 test 2 m 11 80 1
## 9 4 Esther 617 test 2 f 12 70 1
## 10 5 Mary 114 test 2 f 14 68 1
## 11 1 Mike 134 test 1 m 12 84 2
## 12 2 Linda 270 test 1 f 13 90 2
## 13 3 Sam 210 test 1 m 11 74 2
## 14 4 Esther 617 test 1 f 12 75 2
## 15 5 Mary 114 test 1 f 14 67 2
## 16 1 Mike 134 test 2 m 12 80 2
## 17 2 Linda 270 test 2 f 13 82 2
## 18 3 Sam 210 test 2 m 11 87 2
## 19 4 Esther 617 test 2 f 12 75 2
## 20 5 Mary 114 test 2 f 14 70 2
## 21 1 Mike 134 test 1 m 12 87 3
## 22 2 Linda 270 test 1 f 13 73 3
## 23 3 Sam 210 test 1 m 11 80 3
## 24 4 Esther 617 test 1 f 12 74 3
## 25 5 Mary 114 test 1 f 14 64 3
## 26 1 Mike 134 test 2 m 12 90 3
## 27 2 Linda 270 test 2 f 13 94 3
## 28 3 Sam 210 test 2 m 11 80 3
## 29 4 Esther 617 test 2 f 12 78 3
## 30 5 Mary 114 test 2 f 14 63 3
grades_updated <- grades_updated %>%
rename(Term = Col2)
grades_updated
## id name phone test.number Gender Age Grades Term
## 1 1 Mike 134 test 1 m 12 76 1
## 2 2 Linda 270 test 1 f 13 88 1
## 3 3 Sam 210 test 1 m 11 78 1
## 4 4 Esther 617 test 1 f 12 68 1
## 5 5 Mary 114 test 1 f 14 65 1
## 6 1 Mike 134 test 2 m 12 85 1
## 7 2 Linda 270 test 2 f 13 87 1
## 8 3 Sam 210 test 2 m 11 80 1
## 9 4 Esther 617 test 2 f 12 70 1
## 10 5 Mary 114 test 2 f 14 68 1
## 11 1 Mike 134 test 1 m 12 84 2
## 12 2 Linda 270 test 1 f 13 90 2
## 13 3 Sam 210 test 1 m 11 74 2
## 14 4 Esther 617 test 1 f 12 75 2
## 15 5 Mary 114 test 1 f 14 67 2
## 16 1 Mike 134 test 2 m 12 80 2
## 17 2 Linda 270 test 2 f 13 82 2
## 18 3 Sam 210 test 2 m 11 87 2
## 19 4 Esther 617 test 2 f 12 75 2
## 20 5 Mary 114 test 2 f 14 70 2
## 21 1 Mike 134 test 1 m 12 87 3
## 22 2 Linda 270 test 1 f 13 73 3
## 23 3 Sam 210 test 1 m 11 80 3
## 24 4 Esther 617 test 1 f 12 74 3
## 25 5 Mary 114 test 1 f 14 64 3
## 26 1 Mike 134 test 2 m 12 90 3
## 27 2 Linda 270 test 2 f 13 94 3
## 28 3 Sam 210 test 2 m 11 80 3
## 29 4 Esther 617 test 2 f 12 78 3
## 30 5 Mary 114 test 2 f 14 63 3
grades_updated<-spread(grades_updated, "test.number", 7)
grades_updated
## id name phone Gender Age Term test 1 test 2
## 1 1 Mike 134 m 12 1 76 85
## 2 1 Mike 134 m 12 2 84 80
## 3 1 Mike 134 m 12 3 87 90
## 4 2 Linda 270 f 13 1 88 87
## 5 2 Linda 270 f 13 2 90 82
## 6 2 Linda 270 f 13 3 73 94
## 7 3 Sam 210 m 11 1 78 80
## 8 3 Sam 210 m 11 2 74 87
## 9 3 Sam 210 m 11 3 80 80
## 10 4 Esther 617 f 12 1 68 70
## 11 4 Esther 617 f 12 2 75 75
## 12 4 Esther 617 f 12 3 74 78
## 13 5 Mary 114 f 14 1 65 68
## 14 5 Mary 114 f 14 2 67 70
## 15 5 Mary 114 f 14 3 64 63
grades_updated %>%
group_by(phone) %>%
summarise(Mean=mean(`test 1`))
## # A tibble: 5 × 2
## phone Mean
## <int> <dbl>
## 1 114 65.3
## 2 134 82.3
## 3 210 77.3
## 4 270 83.7
## 5 617 72.3
averagesOfeachStudent<- grades_updated %>%
group_by(name, Term) %>%
summarise(Average=(`test 1`+`test 2`)/2)
averagesOfeachStudent
## # A tibble: 15 × 3
## # Groups: name [5]
## name Term Average
## <chr> <int> <dbl>
## 1 Esther 1 69
## 2 Esther 2 75
## 3 Esther 3 76
## 4 Linda 1 87.5
## 5 Linda 2 86
## 6 Linda 3 83.5
## 7 Mary 1 66.5
## 8 Mary 2 68.5
## 9 Mary 3 63.5
## 10 Mike 1 80.5
## 11 Mike 2 82
## 12 Mike 3 88.5
## 13 Sam 1 79
## 14 Sam 2 80.5
## 15 Sam 3 80
averagesOfeachStudent %>% group_by(name)%>%
summarise(Mean=mean(Average))
## # A tibble: 5 × 2
## name Mean
## <chr> <dbl>
## 1 Esther 73.3
## 2 Linda 85.7
## 3 Mary 66.2
## 4 Mike 83.7
## 5 Sam 79.8
Sources: https://www.listendata.com/2015/06/r-keep-drop-columns-from-data-frame.html, https://www.geeksforgeeks.org/how-to-split-column-into-multiple-columns-in-r-dataframe/, https://stackoverflow.com/questions/66953570/r-split-one-column-into-two-when-the-divider-is-a-dot, https://www.programmingr.com/examples/r-dataframe/cbind-in-r/, https://stackoverflow.com/questions/4605206/drop-data-frame-columns-by-name, http://statseducation.com/Introduction-to-R/modules/tidy%20data/spread/
fruits<-read.csv("https://raw.githubusercontent.com/Sangeetha-007/R-Practice/master/607/Projects/Project%202/Fruit%20Prices.csv")
fruits
## item price calories
## 1 "banana" "$1" 105
## 2 "apple" "0.75" 95
## 3 "apple" "0.75" 95
## 4 "peach" "$3" 55
## 5 "peach" "$4" 55
## 6 "clementine" "2.5" 35
duplicated(fruits)
## [1] FALSE FALSE TRUE FALSE FALSE FALSE
fruits<- distinct(fruits)
fruits
## item price calories
## 1 "banana" "$1" 105
## 2 "apple" "0.75" 95
## 3 "peach" "$3" 55
## 4 "peach" "$4" 55
## 5 "clementine" "2.5" 35
fruits$item<-gsub('"',"",as.character(fruits$item))
fruits
## item price calories
## 1 banana "$1" 105
## 2 apple "0.75" 95
## 3 peach "$3" 55
## 4 peach "$4" 55
## 5 clementine "2.5" 35
class(fruits)
## [1] "data.frame"
fruits$price<- gsub('"',"",as.character(fruits$price))
#This code did not work.
#fruits$price<- gsub('$',"",as.character(fruits$price))
#fruits
fruits %>%
mutate(across(starts_with("price"), ~gsub("\\$", "", .) %>% as.numeric))
## item price calories
## 1 banana 1.00 105
## 2 apple 0.75 95
## 3 peach 3.00 55
## 4 peach 4.00 55
## 5 clementine 2.50 35
sapply (fruits, class)
## item price calories
## "character" "character" "integer"
fruits
## item price calories
## 1 banana $1 105
## 2 apple 0.75 95
## 3 peach $3 55
## 4 peach $4 55
## 5 clementine 2.5 35
fruits<- fruits %>%
mutate(price = parse_number(price))
sapply(fruits, class)
## item price calories
## "character" "numeric" "integer"
fruits
## item price calories
## 1 banana 1.00 105
## 2 apple 0.75 95
## 3 peach 3.00 55
## 4 peach 4.00 55
## 5 clementine 2.50 35
summarise(fruits)
## data frame with 0 columns and 1 row
ggplot(fruits, aes( y=`calories`, x=item, fill=item)) +
geom_bar(position="dodge",stat = "identity")+
scale_fill_manual(values=c("#66b447",
"#ffe135",
"orange",
"#DE7E5D"))
Source: https://www.datanovia.com/en/lessons/identify-and-remove-duplicate-data-in-r/, https://stackoverflow.com/questions/64741916/how-to-remove-the-dollar-sign-in-r