library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.1
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.4 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## Warning: package 'tibble' was built under R version 4.1.1
## Warning: package 'readr' was built under R version 4.1.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
url <- "https://raw.githubusercontent.com/omocharly/DATA607_PROJECTS/main/Untidy%20Data%20-%20By%20Eric%20Lehmphul.csv"
testdata <- read.csv(url)
testdata
## Student Test1..TimeStudiedTest1 Test2..TimeStudiedTest2
## 1 Bob 95, 45 88, 40
## 2 John 85, 35 60, 8
## 3 Sam 78, 15 75, 16
## 4 Jenna 92, 60 94, 65
## 5 Sara 97, 40 98, 50
## 6 Jacob 50, 5 40, 2
## 7 Melinda <NA> 90, 47
## 8 Billy 78, 15 80, 25
## 9 Kayla 100, 40 100,40
## 10 Nick 90, 35 94, 32
## 11 Nicolete 75, 20 80, 20
## Test3..TimeStudiedTest3 Test4..TimeStudiedTest4 Gender X
## 1 92, 50 100, 70 Male NA
## 2 75, 10 87, 25 Male NA
## 3 80, 17 76, 10 Female NA
## 4 84, 60 100, 75 Female NA
## 5 95, 45 92, 60 Female NA
## 6 <NA> <NA> Male NA
## 7 92, 55 89, 60 Female NA
## 8 81, 36 86, 30 Male NA
## 9 100, 45 100, 50 Female NA
## 10 94, 30 90, 30 Male NA
## 11 85, 23 79, 10 Female NA
tidy_testdata = testdata %>%
separate(Test1..TimeStudiedTest1,
into = c("Test1","TimeStudiedTest1"),
sep = ", ", convert=TRUE) %>%
separate(Test2..TimeStudiedTest2,
into = c("Test2","TimeStudiedTest2"),
sep = ",", convert=TRUE) %>%
separate(Test3..TimeStudiedTest3,
into = c("Test3","TimeStudiedTest3"),
sep = ", ", convert=TRUE) %>%
separate(Test4..TimeStudiedTest4,
into = c("Test4","TimeStudiedTest4"),
sep = ", ", convert=TRUE)
# To rearrnge the Columns
test_data <- tidy_testdata %>% select(Student, Gender, Test1, Test2, Test3, TimeStudiedTest1, TimeStudiedTest2, TimeStudiedTest3)
test_data
## Student Gender Test1 Test2 Test3 TimeStudiedTest1 TimeStudiedTest2
## 1 Bob Male 95 88 92 45 40
## 2 John Male 85 60 75 35 8
## 3 Sam Female 78 75 80 15 16
## 4 Jenna Female 92 94 84 60 65
## 5 Sara Female 97 98 95 40 50
## 6 Jacob Male 50 40 NA 5 2
## 7 Melinda Female NA 90 92 NA 47
## 8 Billy Male 78 80 81 15 25
## 9 Kayla Female 100 100 100 40 40
## 10 Nick Male 90 94 94 35 32
## 11 Nicolete Female 75 80 85 20 20
## TimeStudiedTest3
## 1 50
## 2 10
## 3 17
## 4 60
## 5 45
## 6 NA
## 7 55
## 8 36
## 9 45
## 10 30
## 11 23
Replace “NA” with Zero(0).
test_data1 <- test_data %>% replace(is.na(.), 0)
test_data1
## Student Gender Test1 Test2 Test3 TimeStudiedTest1 TimeStudiedTest2
## 1 Bob Male 95 88 92 45 40
## 2 John Male 85 60 75 35 8
## 3 Sam Female 78 75 80 15 16
## 4 Jenna Female 92 94 84 60 65
## 5 Sara Female 97 98 95 40 50
## 6 Jacob Male 50 40 0 5 2
## 7 Melinda Female 0 90 92 0 47
## 8 Billy Male 78 80 81 15 25
## 9 Kayla Female 100 100 100 40 40
## 10 Nick Male 90 94 94 35 32
## 11 Nicolete Female 75 80 85 20 20
## TimeStudiedTest3
## 1 50
## 2 10
## 3 17
## 4 60
## 5 45
## 6 0
## 7 55
## 8 36
## 9 45
## 10 30
## 11 23
test_score <- mutate(test_data1,
Avg_score = round((Test1 + Test2 + Test3)/3,0),
Avg_studytime = round((TimeStudiedTest1 + TimeStudiedTest2 + TimeStudiedTest3)/3,0))
test_score
## Student Gender Test1 Test2 Test3 TimeStudiedTest1 TimeStudiedTest2
## 1 Bob Male 95 88 92 45 40
## 2 John Male 85 60 75 35 8
## 3 Sam Female 78 75 80 15 16
## 4 Jenna Female 92 94 84 60 65
## 5 Sara Female 97 98 95 40 50
## 6 Jacob Male 50 40 0 5 2
## 7 Melinda Female 0 90 92 0 47
## 8 Billy Male 78 80 81 15 25
## 9 Kayla Female 100 100 100 40 40
## 10 Nick Male 90 94 94 35 32
## 11 Nicolete Female 75 80 85 20 20
## TimeStudiedTest3 Avg_score Avg_studytime
## 1 50 92 45
## 2 10 73 18
## 3 17 78 16
## 4 60 90 62
## 5 45 97 45
## 6 0 30 2
## 7 55 61 34
## 8 36 80 25
## 9 45 100 42
## 10 30 93 32
## 11 23 80 21
test_score <- select(test_score, Student, Gender, Avg_score, Avg_studytime)
test_score
## Student Gender Avg_score Avg_studytime
## 1 Bob Male 92 45
## 2 John Male 73 18
## 3 Sam Female 78 16
## 4 Jenna Female 90 62
## 5 Sara Female 97 45
## 6 Jacob Male 30 2
## 7 Melinda Female 61 34
## 8 Billy Male 80 25
## 9 Kayla Female 100 42
## 10 Nick Male 93 32
## 11 Nicolete Female 80 21
scorevtime <- ggplot(data = test_score, aes(Avg_studytime, Avg_score)) + geom_point() +
geom_smooth(se = FALSE) + ylab("Average Score") + xlab("Average Study Time") + theme_bw() +
labs(title = "Average Test Score vs Average Study Time")
scorevtime
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
The correlation between them is 0.7278
x <- test_score$Avg_studytime
y <- test_score$Avg_score
cor.test(x, y)
##
## Pearson's product-moment correlation
##
## data: x and y
## t = 3.1843, df = 9, p-value = 0.01111
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2271646 0.9242026
## sample estimates:
## cor
## 0.7278546
url2 <- "https://raw.githubusercontent.com/omocharly/DATA607_PROJECTS/main/Untidy%20Data%20%20-%20Charles%20Ugiagbe.csv"
Exam_score <- read.csv(url2)
head(Exam_score)
## Timestamp Entering.Grade.Level District Birth.Month OLSAT.Verbal.Score
## 1 4/8/2017 6:44 1 6 September 28
## 2 4/7/2017 10:40 K NA August 25
## 3 4/7/2017 10:41 1 NA March 27
## 4 4/7/2017 10:43 K NA September 23
## 5 4/10/2017 10:18 K 22 April 25
## 6 4/7/2017 11:03 K NA May 24
## OLSAT.Verbal.Percentile NNAT.Non.Verbal.Raw.Score NNAT.Non.Verbal.Percentile
## 1 99 45 99
## 2 99 39 99
## 3 96 42 99
## 4 97 40 99
## 5 98 38 99
## 6 97 36 98
## Overall.Score School.Preferences
## 1 99 NEST+m, TAG, Anderson, Q300
## 2 99 Anderson, NEST+m
## 3 98
## 4 98
## 5 99 Brooklyn School of Inquiry
## 6 98
## School.Assigned Will.you.enroll.there.
## 1 NEST YES
## 2 Maybe
## 3 Maybe
## 4
## 5 Currently - local Brooklyn dual language Maybe
## 6
Exam_score$date_component <- as.Date(Exam_score$Timestamp,'%d/%m/%Y')
glimpse(Exam_score)
## Rows: 117
## Columns: 13
## $ Timestamp <chr> "4/8/2017 6:44", "4/7/2017 10:40", "4/7/201~
## $ Entering.Grade.Level <chr> "1", "K", "1", "K", "K", "K", "K", "K", "1"~
## $ District <int> 6, NA, NA, NA, 22, NA, NA, NA, NA, NA, NA, ~
## $ Birth.Month <chr> "September", "August", "March", "September"~
## $ OLSAT.Verbal.Score <chr> "28", "25", "27", "23", "25", "24", "26", "~
## $ OLSAT.Verbal.Percentile <chr> "99", "99", "96", "97", "98", "97", "99", "~
## $ NNAT.Non.Verbal.Raw.Score <chr> "45", "39", "42", "40", "38", "36", "42", "~
## $ NNAT.Non.Verbal.Percentile <int> 99, 99, 99, 99, 99, 98, 99, 99, 99, 99, 99,~
## $ Overall.Score <int> 99, 99, 98, 98, 99, 98, 99, 99, 95, 99, 94,~
## $ School.Preferences <chr> "NEST+m, TAG, Anderson, Q300", "Anderson, N~
## $ School.Assigned <chr> "NEST", "", "", "", "Currently - local Broo~
## $ Will.you.enroll.there. <chr> "YES", "Maybe", "Maybe", "", "Maybe", "", "~
## $ date_component <date> 2017-08-04, 2017-07-04, 2017-07-04, 2017-0~
Exam_score <- select(Exam_score, date_component, Entering.Grade.Level, Birth.Month, OLSAT.Verbal.Score, NNAT.Non.Verbal.Raw.Score)
head(Exam_score)
## date_component Entering.Grade.Level Birth.Month OLSAT.Verbal.Score
## 1 2017-08-04 1 September 28
## 2 2017-07-04 K August 25
## 3 2017-07-04 1 March 27
## 4 2017-07-04 K September 23
## 5 2017-10-04 K April 25
## 6 2017-07-04 K May 24
## NNAT.Non.Verbal.Raw.Score
## 1 45
## 2 39
## 3 42
## 4 40
## 5 38
## 6 36
Exam_score1 = Exam_score %>% rename(
Date = date_component,
Entering_Grade_lvl = Entering.Grade.Level,
Birth_month = Birth.Month,
Verbal_score = OLSAT.Verbal.Score,
Non_verbal_score = NNAT.Non.Verbal.Raw.Score)
Exam_score1
## Date Entering_Grade_lvl Birth_month Verbal_score Non_verbal_score
## 1 2017-08-04 1 September 28 45
## 2 2017-07-04 K August 25 39
## 3 2017-07-04 1 March 27 42
## 4 2017-07-04 K September 23 40
## 5 2017-10-04 K April 25 38
## 6 2017-07-04 K May 24 36
## 7 2017-07-04 K February 26 42
## 8 2017-07-04 K October 24 42
## 9 2017-07-04 1 March 23 42
## 10 2017-07-04 2 April 29 44
## 11 2017-07-04 K June 17 39
## 12 2017-07-04 K December 23 45
## 13 2017-07-04 1 July 24 39
## 14 2017-07-04 1 May 29 48
## 15 2017-07-04 1 January 28 43
## 16 2017-07-04 K January 26 40
## 17 2017-07-04 1 January 26 46
## 18 2017-07-04 1 June 25 41
## 19 2017-08-04 K December 21 43
## 20 2017-08-04 K August 25 39
## 21 2017-08-04 K May 28 46
## 22 2017-08-04 K July 28 38
## 23 2017-08-04 2 February 28 44
## 24 2017-08-04 1 November 28 41
## 25 2017-08-04 1 January 26 42
## 26 2017-08-04 2 January 28 43
## 27 2017-08-04 1 March 28 45
## 28 2017-08-04 2 November 28 47
## 29 2017-08-04 1 January 26 46
## 30 2017-08-04 K February 27 46
## 31 2017-09-04 K May 27 45
## 32 2017-10-04 K March 32 40
## 33 2017-10-04 K May 19 40
## 34 2017-10-04 K May 24/30 36/48
## 35 2017-10-04 K August 13 27
## 36 2017-10-04 K August 29 37
## 37 2017-11-04 K July 18 38
## 38 2017-11-04 K December 23/30 40/48
## 39 2017-12-04 K November 18 45
## 40 2017-12-04 K July 22 41
## 41 <NA> K April 23 39
## 42 <NA> K March 17/30 36/48
## 43 <NA> 1 January 24 42
## 44 <NA> K October 26 35
## 45 <NA> K February 26 40
## 46 <NA> 1 December 27 43
## 47 <NA> 1 January 28 46
## 48 <NA> K March 24 44
## 49 <NA> 1 December 29 45
## 50 <NA> K September 28 42
## 51 <NA> 1 March 29 44
## 52 <NA> K February 20 33
## 53 <NA> 2 December 28 44
## 54 <NA> K March 25 40
## 55 <NA> K January 24 45
## 56 <NA> K April 24 40
## 57 <NA> K July 24 34
## 58 <NA> K February 23 37
## 59 <NA> K July 27 38
## 60 <NA> K October 22 34
## 61 <NA> K May 25 36
## 62 <NA> K March 26 45
## 63 <NA> 1 January 26 39
## 64 <NA> K March 28 43
## 65 <NA> 1 April 23 45
## 66 <NA> 1 May 29 48
## 67 <NA> 2 July 29 40
## 68 <NA> K January 24 40
## 69 <NA> K September 99 99
## 70 <NA> K January 25 41
## 71 <NA> K January 19 38
## 72 <NA> 1 May 29 44
## 73 <NA> 3 October 90 90
## 74 <NA> K November 20 39
## 75 <NA> K March 21 39
## 76 <NA> 2 November 28 47
## 77 <NA> 3 March 30 39
## 78 2017-03-05 K December 23 37
## 79 2017-03-05 K December 23 37
## 80 2017-03-05 1 February 21 42
## 81 2017-04-05 K January 24/30 39/48
## 82 2017-05-05 K May 35 40
## 83 2017-05-05 K March 22 40
## 84 2017-07-05 1 October 28 41
## 85 2017-08-05 K August Fill out later. Fill out later.
## 86 2017-09-05 2 May 28 37
## 87 2017-10-05 K July 19/30 41/48
## 88 <NA> 1 November 29 44
## 89 <NA> K December 26 46
## 90 <NA> K February 23 40
## 91 <NA> K February 25 41
## 92 <NA> K October 30 43
## 93 <NA> K February 30 42
## 94 2017-01-06 K June 24 43
## 95 2017-01-06 K May 25 36
## 96 2017-01-06 K April 25 38
## 97 2017-01-06 K January 23 38
## 98 2017-01-06 2 October 28 47
## 99 2017-01-06 K July 83 99
## 100 2017-01-06 K May 26 44
## 101 2017-01-06 2 April 29 44
## 102 2017-01-06 1 December 27 43
## 103 2017-01-06 K April 29 42
## 104 2017-01-06 K January 28 41
## 105 2017-02-06 1 March ** **
## 106 2017-02-06 K July 24 34
## 107 2017-02-06 K April 23 39
## 108 2017-04-06 K April 27 37
## 109 2017-04-06 1 November - -
## 110 2017-06-06 K April 25 39
## 111 2017-06-06 K January 24 45
## 112 2017-07-06 K December 99 99
## 113 <NA> 1 August 30 44
## 114 <NA> K July 97 97
## 115 <NA> K February 26 40
## 116 <NA> K April 19 33
## 117 2019-11-01 K January 24 44
Exam_score2 <- ggplot(data = Exam_score1, aes(as.numeric(Verbal_score), as.numeric(Non_verbal_score))) + geom_point() +
geom_smooth(se = FALSE) + ylab("Verbal Score") + xlab("Non Verbal Score") + theme_bw() +
labs(title = "Verbal Score vs Non Verbal Score")
Exam_score2
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 8 rows containing non-finite values (stat_smooth).
## Warning: Removed 8 rows containing missing values (geom_point).
url3 <- "https://raw.githubusercontent.com/omocharly/DATA607_PROJECTS/main/Untidy%20Data%20-%20by%20Mauricio%20Claudio.csv"
churn_data <- read.csv(url3)
churn_data
## Division Description Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## 1 A Gained 70 80 100 110 70 45 50 99 112 99 55 110
## 2 Lost 0 -90 -30 -45 -95 -33 -110 -34 -34 -88 -65 -45
## 3 B Gained 80 80 90 120 100 119 75 119 90 80 80 90
## 4 Lost 0 -15 -30 -25 -50 -77 -45 -77 -30 -15 -20 -30
## 5 C Gained 60 85 80 90 120 45 75 45 80 85 60 80
## 6 Lost 0 -45 -27 -17 -33 -80 -45 -80 -27 -45 -35 -27
churn_data[churn_data==""]=NA
churn_new = churn_data %>%
select(-Description) %>%
fill("Division") %>%
pivot_longer(c(Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec),
names_to="Month",
values_to = "Customers",
names_transform = list(Customers=as.integer)
)
churn_new
## # A tibble: 72 x 3
## Division Month Customers
## <chr> <chr> <int>
## 1 A Jan 70
## 2 A Feb 80
## 3 A Mar 100
## 4 A Apr 110
## 5 A May 70
## 6 A Jun 45
## 7 A Jul 50
## 8 A Aug 99
## 9 A Sep 112
## 10 A Oct 99
## # ... with 62 more rows
churn_new %>% group_by(Division) %>%
summarize(Net_Customers=sum(Customers)) %>%
ggplot(aes(Division,Net_Customers)) + geom_col()
churn_new %>%
group_by(Month) %>%
summarize(Net_Customers=sum(Customers)) %>%
ggplot(aes(Month,Net_Customers)) + geom_col()