Tidying and Transforming Wide Data

Title: CUNY SPS MDS DATA607_PROJ2"

Author: Charles Ugiagbe

Date: “10/3/2021”

Untidy Data 1: Submitted by Eric Lehmphul

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.1.1

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.4     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.0.1     v forcats 0.5.1

## Warning: package 'tibble' was built under R version 4.1.1

## Warning: package 'readr' was built under R version 4.1.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

Read the data

url <- "https://raw.githubusercontent.com/omocharly/DATA607_PROJECTS/main/Untidy%20Data%20-%20By%20Eric%20Lehmphul.csv"
testdata <- read.csv(url)
testdata

##     Student Test1..TimeStudiedTest1 Test2..TimeStudiedTest2
## 1       Bob                  95, 45                  88, 40
## 2      John                  85, 35                   60, 8
## 3       Sam                  78, 15                  75, 16
## 4     Jenna                  92, 60                  94, 65
## 5      Sara                  97, 40                  98, 50
## 6     Jacob                   50, 5                   40, 2
## 7   Melinda                    <NA>                  90, 47
## 8     Billy                  78, 15                  80, 25
## 9     Kayla                 100, 40                  100,40
## 10     Nick                  90, 35                  94, 32
## 11 Nicolete                  75, 20                  80, 20
##    Test3..TimeStudiedTest3 Test4..TimeStudiedTest4 Gender  X
## 1                   92, 50                 100, 70   Male NA
## 2                   75, 10                  87, 25   Male NA
## 3                   80, 17                  76, 10 Female NA
## 4                   84, 60                 100, 75 Female NA
## 5                   95, 45                  92, 60 Female NA
## 6                     <NA>                    <NA>   Male NA
## 7                   92, 55                  89, 60 Female NA
## 8                   81, 36                  86, 30   Male NA
## 9                  100, 45                 100, 50 Female NA
## 10                  94, 30                  90, 30   Male NA
## 11                  85, 23                  79, 10 Female NA

Tidy the data by separating the student score fromt the time.

tidy_testdata = testdata %>%
  separate(Test1..TimeStudiedTest1,
           into = c("Test1","TimeStudiedTest1"),
           sep = ", ", convert=TRUE) %>%
  separate(Test2..TimeStudiedTest2,
           into = c("Test2","TimeStudiedTest2"),
           sep = ",", convert=TRUE) %>%
  separate(Test3..TimeStudiedTest3,
           into = c("Test3","TimeStudiedTest3"),
           sep = ", ", convert=TRUE) %>%
  separate(Test4..TimeStudiedTest4,
           into = c("Test4","TimeStudiedTest4"),
           sep = ", ", convert=TRUE)

# To rearrnge the Columns
test_data <- tidy_testdata %>% select(Student, Gender, Test1, Test2, Test3, TimeStudiedTest1, TimeStudiedTest2, TimeStudiedTest3)
test_data

##     Student Gender Test1 Test2 Test3 TimeStudiedTest1 TimeStudiedTest2
## 1       Bob   Male    95    88    92               45               40
## 2      John   Male    85    60    75               35                8
## 3       Sam Female    78    75    80               15               16
## 4     Jenna Female    92    94    84               60               65
## 5      Sara Female    97    98    95               40               50
## 6     Jacob   Male    50    40    NA                5                2
## 7   Melinda Female    NA    90    92               NA               47
## 8     Billy   Male    78    80    81               15               25
## 9     Kayla Female   100   100   100               40               40
## 10     Nick   Male    90    94    94               35               32
## 11 Nicolete Female    75    80    85               20               20
##    TimeStudiedTest3
## 1                50
## 2                10
## 3                17
## 4                60
## 5                45
## 6                NA
## 7                55
## 8                36
## 9                45
## 10               30
## 11               23

Replace “NA” with Zero(0).

test_data1 <- test_data %>% replace(is.na(.), 0)
test_data1

##     Student Gender Test1 Test2 Test3 TimeStudiedTest1 TimeStudiedTest2
## 1       Bob   Male    95    88    92               45               40
## 2      John   Male    85    60    75               35                8
## 3       Sam Female    78    75    80               15               16
## 4     Jenna Female    92    94    84               60               65
## 5      Sara Female    97    98    95               40               50
## 6     Jacob   Male    50    40     0                5                2
## 7   Melinda Female     0    90    92                0               47
## 8     Billy   Male    78    80    81               15               25
## 9     Kayla Female   100   100   100               40               40
## 10     Nick   Male    90    94    94               35               32
## 11 Nicolete Female    75    80    85               20               20
##    TimeStudiedTest3
## 1                50
## 2                10
## 3                17
## 4                60
## 5                45
## 6                 0
## 7                55
## 8                36
## 9                45
## 10               30
## 11               23

Calculate the Average scores and Time studied.

test_score <- mutate(test_data1,
  Avg_score = round((Test1 + Test2 + Test3)/3,0),
  Avg_studytime = round((TimeStudiedTest1 + TimeStudiedTest2 + TimeStudiedTest3)/3,0))
test_score

##     Student Gender Test1 Test2 Test3 TimeStudiedTest1 TimeStudiedTest2
## 1       Bob   Male    95    88    92               45               40
## 2      John   Male    85    60    75               35                8
## 3       Sam Female    78    75    80               15               16
## 4     Jenna Female    92    94    84               60               65
## 5      Sara Female    97    98    95               40               50
## 6     Jacob   Male    50    40     0                5                2
## 7   Melinda Female     0    90    92                0               47
## 8     Billy   Male    78    80    81               15               25
## 9     Kayla Female   100   100   100               40               40
## 10     Nick   Male    90    94    94               35               32
## 11 Nicolete Female    75    80    85               20               20
##    TimeStudiedTest3 Avg_score Avg_studytime
## 1                50        92            45
## 2                10        73            18
## 3                17        78            16
## 4                60        90            62
## 5                45        97            45
## 6                 0        30             2
## 7                55        61            34
## 8                36        80            25
## 9                45       100            42
## 10               30        93            32
## 11               23        80            21

Select the needed Column

test_score <- select(test_score, Student, Gender, Avg_score, Avg_studytime)
test_score

##     Student Gender Avg_score Avg_studytime
## 1       Bob   Male        92            45
## 2      John   Male        73            18
## 3       Sam Female        78            16
## 4     Jenna Female        90            62
## 5      Sara Female        97            45
## 6     Jacob   Male        30             2
## 7   Melinda Female        61            34
## 8     Billy   Male        80            25
## 9     Kayla Female       100            42
## 10     Nick   Male        93            32
## 11 Nicolete Female        80            21

Observe the Plot of Average score against studied Time.

scorevtime <- ggplot(data = test_score, aes(Avg_studytime, Avg_score)) + geom_point() +
  geom_smooth(se = FALSE) + ylab("Average Score") + xlab("Average Study Time") + theme_bw() + 
  labs(title = "Average Test Score vs Average Study Time")
scorevtime

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

The correlation between them is 0.7278

x <- test_score$Avg_studytime
y <- test_score$Avg_score
cor.test(x, y)

## 
##  Pearson's product-moment correlation
## 
## data:  x and y
## t = 3.1843, df = 9, p-value = 0.01111
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2271646 0.9242026
## sample estimates:
##       cor 
## 0.7278546

Untidy Data 2: Submitted by me (Charles Ugiagbe)

Read the Data

url2 <- "https://raw.githubusercontent.com/omocharly/DATA607_PROJECTS/main/Untidy%20Data%20%20-%20Charles%20Ugiagbe.csv"
Exam_score <- read.csv(url2)
head(Exam_score)

##         Timestamp Entering.Grade.Level District Birth.Month OLSAT.Verbal.Score
## 1   4/8/2017 6:44                    1        6   September                 28
## 2  4/7/2017 10:40                    K       NA      August                 25
## 3  4/7/2017 10:41                    1       NA       March                 27
## 4  4/7/2017 10:43                    K       NA   September                 23
## 5 4/10/2017 10:18                    K       22       April                 25
## 6  4/7/2017 11:03                    K       NA         May                 24
##   OLSAT.Verbal.Percentile NNAT.Non.Verbal.Raw.Score NNAT.Non.Verbal.Percentile
## 1                      99                        45                         99
## 2                      99                        39                         99
## 3                      96                        42                         99
## 4                      97                        40                         99
## 5                      98                        38                         99
## 6                      97                        36                         98
##   Overall.Score          School.Preferences
## 1            99 NEST+m, TAG, Anderson, Q300
## 2            99            Anderson, NEST+m
## 3            98                            
## 4            98                            
## 5            99  Brooklyn School of Inquiry
## 6            98                            
##                            School.Assigned Will.you.enroll.there.
## 1                                     NEST                    YES
## 2                                                           Maybe
## 3                                                           Maybe
## 4                                                                
## 5 Currently - local Brooklyn dual language                  Maybe
## 6

Clean the Data; Extract the Date from the Timestamp

Exam_score$date_component <- as.Date(Exam_score$Timestamp,'%d/%m/%Y')
glimpse(Exam_score)

## Rows: 117
## Columns: 13
## $ Timestamp                  <chr> "4/8/2017 6:44", "4/7/2017 10:40", "4/7/201~
## $ Entering.Grade.Level       <chr> "1", "K", "1", "K", "K", "K", "K", "K", "1"~
## $ District                   <int> 6, NA, NA, NA, 22, NA, NA, NA, NA, NA, NA, ~
## $ Birth.Month                <chr> "September", "August", "March", "September"~
## $ OLSAT.Verbal.Score         <chr> "28", "25", "27", "23", "25", "24", "26", "~
## $ OLSAT.Verbal.Percentile    <chr> "99", "99", "96", "97", "98", "97", "99", "~
## $ NNAT.Non.Verbal.Raw.Score  <chr> "45", "39", "42", "40", "38", "36", "42", "~
## $ NNAT.Non.Verbal.Percentile <int> 99, 99, 99, 99, 99, 98, 99, 99, 99, 99, 99,~
## $ Overall.Score              <int> 99, 99, 98, 98, 99, 98, 99, 99, 95, 99, 94,~
## $ School.Preferences         <chr> "NEST+m, TAG, Anderson, Q300", "Anderson, N~
## $ School.Assigned            <chr> "NEST", "", "", "", "Currently - local Broo~
## $ Will.you.enroll.there.     <chr> "YES", "Maybe", "Maybe", "", "Maybe", "", "~
## $ date_component             <date> 2017-08-04, 2017-07-04, 2017-07-04, 2017-0~

Select the Column of interest

Exam_score <- select(Exam_score, date_component, Entering.Grade.Level, Birth.Month, OLSAT.Verbal.Score, NNAT.Non.Verbal.Raw.Score)
head(Exam_score)

##   date_component Entering.Grade.Level Birth.Month OLSAT.Verbal.Score
## 1     2017-08-04                    1   September                 28
## 2     2017-07-04                    K      August                 25
## 3     2017-07-04                    1       March                 27
## 4     2017-07-04                    K   September                 23
## 5     2017-10-04                    K       April                 25
## 6     2017-07-04                    K         May                 24
##   NNAT.Non.Verbal.Raw.Score
## 1                        45
## 2                        39
## 3                        42
## 4                        40
## 5                        38
## 6                        36

Exam_score1 = Exam_score %>% rename(
      Date = date_component,
      Entering_Grade_lvl = Entering.Grade.Level,
      Birth_month = Birth.Month,
      Verbal_score = OLSAT.Verbal.Score,
      Non_verbal_score = NNAT.Non.Verbal.Raw.Score)
Exam_score1

##           Date Entering_Grade_lvl Birth_month    Verbal_score Non_verbal_score
## 1   2017-08-04                  1   September              28               45
## 2   2017-07-04                  K      August              25               39
## 3   2017-07-04                  1       March              27               42
## 4   2017-07-04                  K   September              23               40
## 5   2017-10-04                  K       April              25               38
## 6   2017-07-04                  K         May              24               36
## 7   2017-07-04                  K    February              26               42
## 8   2017-07-04                  K     October              24               42
## 9   2017-07-04                  1       March              23               42
## 10  2017-07-04                  2       April              29               44
## 11  2017-07-04                  K        June              17               39
## 12  2017-07-04                  K    December              23               45
## 13  2017-07-04                  1        July              24               39
## 14  2017-07-04                  1         May              29               48
## 15  2017-07-04                  1     January              28               43
## 16  2017-07-04                  K     January              26               40
## 17  2017-07-04                  1     January              26               46
## 18  2017-07-04                  1        June              25               41
## 19  2017-08-04                  K    December              21               43
## 20  2017-08-04                  K      August              25               39
## 21  2017-08-04                  K         May              28               46
## 22  2017-08-04                  K        July              28               38
## 23  2017-08-04                  2    February              28               44
## 24  2017-08-04                  1    November              28               41
## 25  2017-08-04                  1     January              26               42
## 26  2017-08-04                  2     January              28               43
## 27  2017-08-04                  1       March              28               45
## 28  2017-08-04                  2    November              28               47
## 29  2017-08-04                  1     January              26               46
## 30  2017-08-04                  K    February              27               46
## 31  2017-09-04                  K         May              27               45
## 32  2017-10-04                  K       March              32               40
## 33  2017-10-04                  K         May              19               40
## 34  2017-10-04                  K         May           24/30            36/48
## 35  2017-10-04                  K      August              13               27
## 36  2017-10-04                  K      August              29               37
## 37  2017-11-04                  K        July              18               38
## 38  2017-11-04                  K    December           23/30            40/48
## 39  2017-12-04                  K    November              18               45
## 40  2017-12-04                  K        July              22               41
## 41        <NA>                  K       April              23               39
## 42        <NA>                  K       March           17/30            36/48
## 43        <NA>                  1     January              24               42
## 44        <NA>                  K     October              26               35
## 45        <NA>                  K    February              26               40
## 46        <NA>                  1    December              27               43
## 47        <NA>                  1     January              28               46
## 48        <NA>                  K       March              24               44
## 49        <NA>                  1    December              29               45
## 50        <NA>                  K   September              28               42
## 51        <NA>                  1       March              29               44
## 52        <NA>                  K    February              20               33
## 53        <NA>                  2    December              28               44
## 54        <NA>                  K       March              25               40
## 55        <NA>                  K     January              24               45
## 56        <NA>                  K       April              24               40
## 57        <NA>                  K        July              24               34
## 58        <NA>                  K    February              23               37
## 59        <NA>                  K        July              27               38
## 60        <NA>                  K     October              22               34
## 61        <NA>                  K         May              25               36
## 62        <NA>                  K       March              26               45
## 63        <NA>                  1     January              26               39
## 64        <NA>                  K       March              28               43
## 65        <NA>                  1       April              23               45
## 66        <NA>                  1         May              29               48
## 67        <NA>                  2        July              29               40
## 68        <NA>                  K     January              24               40
## 69        <NA>                  K   September              99               99
## 70        <NA>                  K     January              25               41
## 71        <NA>                  K     January              19               38
## 72        <NA>                  1         May              29               44
## 73        <NA>                  3     October              90               90
## 74        <NA>                  K    November              20               39
## 75        <NA>                  K       March              21               39
## 76        <NA>                  2    November              28               47
## 77        <NA>                  3       March              30               39
## 78  2017-03-05                  K    December              23               37
## 79  2017-03-05                  K    December              23               37
## 80  2017-03-05                  1    February              21               42
## 81  2017-04-05                  K     January           24/30            39/48
## 82  2017-05-05                  K         May              35               40
## 83  2017-05-05                  K       March              22               40
## 84  2017-07-05                  1     October              28               41
## 85  2017-08-05                  K      August Fill out later.  Fill out later.
## 86  2017-09-05                  2         May              28               37
## 87  2017-10-05                  K        July           19/30            41/48
## 88        <NA>                  1    November              29               44
## 89        <NA>                  K    December              26               46
## 90        <NA>                  K    February              23               40
## 91        <NA>                  K    February              25               41
## 92        <NA>                  K     October              30               43
## 93        <NA>                  K    February              30               42
## 94  2017-01-06                  K        June              24               43
## 95  2017-01-06                  K         May              25               36
## 96  2017-01-06                  K       April              25               38
## 97  2017-01-06                  K     January              23               38
## 98  2017-01-06                  2     October              28               47
## 99  2017-01-06                  K        July              83               99
## 100 2017-01-06                  K         May              26               44
## 101 2017-01-06                  2       April              29               44
## 102 2017-01-06                  1    December              27               43
## 103 2017-01-06                  K       April              29               42
## 104 2017-01-06                  K     January              28               41
## 105 2017-02-06                  1       March              **               **
## 106 2017-02-06                  K        July              24               34
## 107 2017-02-06                  K       April              23               39
## 108 2017-04-06                  K       April              27               37
## 109 2017-04-06                  1    November               -                -
## 110 2017-06-06                  K       April              25               39
## 111 2017-06-06                  K     January              24               45
## 112 2017-07-06                  K    December              99               99
## 113       <NA>                  1      August              30               44
## 114       <NA>                  K        July              97               97
## 115       <NA>                  K    February              26               40
## 116       <NA>                  K       April              19               33
## 117 2019-11-01                  K     January              24               44

Take the Plot of Verbal Vs Non-Verbal

Exam_score2 <- ggplot(data = Exam_score1, aes(as.numeric(Verbal_score), as.numeric(Non_verbal_score))) + geom_point() +
  geom_smooth(se = FALSE) + ylab("Verbal Score") + xlab("Non Verbal Score") + theme_bw() + 
  labs(title = "Verbal Score vs Non Verbal Score")
Exam_score2

## Warning in FUN(X[[i]], ...): NAs introduced by coercion

## Warning in FUN(X[[i]], ...): NAs introduced by coercion

## Warning in FUN(X[[i]], ...): NAs introduced by coercion

## Warning in FUN(X[[i]], ...): NAs introduced by coercion

## Warning in FUN(X[[i]], ...): NAs introduced by coercion

## Warning in FUN(X[[i]], ...): NAs introduced by coercion

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

## Warning: Removed 8 rows containing non-finite values (stat_smooth).

## Warning: Removed 8 rows containing missing values (geom_point).

Untidy Data 3: Submitted by Mauricio Claudio

url3 <- "https://raw.githubusercontent.com/omocharly/DATA607_PROJECTS/main/Untidy%20Data%20-%20by%20Mauricio%20Claudio.csv"
churn_data <- read.csv(url3)
churn_data

##   Division Description Jan Feb Mar Apr May Jun  Jul Aug Sep Oct Nov Dec
## 1        A      Gained  70  80 100 110  70  45   50  99 112  99  55 110
## 2                 Lost   0 -90 -30 -45 -95 -33 -110 -34 -34 -88 -65 -45
## 3        B      Gained  80  80  90 120 100 119   75 119  90  80  80  90
## 4                 Lost   0 -15 -30 -25 -50 -77  -45 -77 -30 -15 -20 -30
## 5        C      Gained  60  85  80  90 120  45   75  45  80  85  60  80
## 6                 Lost   0 -45 -27 -17 -33 -80  -45 -80 -27 -45 -35 -27

Clean and Tidy the Data

churn_data[churn_data==""]=NA
churn_new = churn_data %>%
  select(-Description) %>%
  fill("Division") %>%
  pivot_longer(c(Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec),
  names_to="Month",
  values_to = "Customers",
  names_transform = list(Customers=as.integer)
  )

churn_new

## # A tibble: 72 x 3
##    Division Month Customers
##    <chr>    <chr>     <int>
##  1 A        Jan          70
##  2 A        Feb          80
##  3 A        Mar         100
##  4 A        Apr         110
##  5 A        May          70
##  6 A        Jun          45
##  7 A        Jul          50
##  8 A        Aug          99
##  9 A        Sep         112
## 10 A        Oct          99
## # ... with 62 more rows

To test the most Effective Division using the ratio by Substracting loss from gain

churn_new %>% group_by(Division) %>%
  summarize(Net_Customers=sum(Customers)) %>%
  ggplot(aes(Division,Net_Customers)) + geom_col()

To show which month Generate the Most Customers

churn_new %>%
  group_by(Month) %>%
  summarize(Net_Customers=sum(Customers)) %>%
  ggplot(aes(Month,Net_Customers)) + geom_col()