DATA607 Project 2

Gehad Gad

March 8th, 2020

DATA 607 Project 2

In this project, we are asked to choose any three of the “wide” datasets identified in the Week 5 Discussion items. For each of the three chosen datasets:

1.Create a .CSV file (or optionally, a MySQL database!) that includes all of the information included in the dataset. You’re encouraged to use a “wide” structure similar to how the information appears in the discussion item, so that you can practice tidying and transformations as described below.

2.Read the information from your .CSV file into R, and use tidyr and dplyr as needed to tidy and transform your data. [Most of your grade will be based on this step!]

3.Perform the analysis requested in the discussion item.

#Import libraries and/or Packages
library(ggplot2)
library (tidyr)

## Warning: package 'tidyr' was built under R version 3.6.2

library (dplyr)

## Warning: package 'dplyr' was built under R version 3.6.2

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(funModeling)

## Warning: package 'funModeling' was built under R version 3.6.3

## Loading required package: Hmisc

## Warning: package 'Hmisc' was built under R version 3.6.3

## Loading required package: lattice

## Loading required package: survival

## Warning: package 'survival' was built under R version 3.6.3

## Loading required package: Formula

## 
## Attaching package: 'Hmisc'

## The following objects are masked from 'package:dplyr':
## 
##     src, summarize

## The following objects are masked from 'package:base':
## 
##     format.pval, units

## funModeling v.1.9.3 :)
## Examples and tutorials at livebook.datascienceheroes.com
##  / Now in Spanish: librovivodecienciadedatos.ai

library(fastDummies)

Dataset 1:

This dataset is include information about student performance during three terms for two tests.

#Import the data from GitHub:

#Data1 <- read.csv(file="https://github.com/GehadGad/Project2-data3/raw/master/Studentlist.csv")


#Import the data from csv file:
Data1 <- read.csv ("Studentlist.csv")

#Display the data.
head(Data1)

##   id   name phone sex.and.age test.number term.1 term.2 term.3
## 1  1   Mike   134        m_12      test 1     76     84     87
## 2  2  Linda   270        f_13      test 1     88     90     73
## 3  3    Sam   210        m_11      test 1     78     74     80
## 4  4 Esther   617        f_12      test 1     68     75     74
## 5  5   Mary   114        f_14      test 1     65     67     64
## 6  1   Mike   134        m_12      test 2     85     80     90

#Create a subset for the data and remove the (id) column.
Data1 = subset(Data1, select = -c(id) )


#Separate the column (sex.and.age) to two separate columns (sex) and (age)
Data1_tidy <-separate(Data1, sex.and.age, into = c("Sex", "Age"))

#Gather all the terms in one columns.
Data1_tidy <- gather(Data1_tidy, Terms, score, 6:8)


#Display the data.
head(Data1_tidy)

##     name phone Sex Age test.number  Terms score
## 1   Mike   134   m  12      test 1 term.1    76
## 2  Linda   270   f  13      test 1 term.1    88
## 3    Sam   210   m  11      test 1 term.1    78
## 4 Esther   617   f  12      test 1 term.1    68
## 5   Mary   114   f  14      test 1 term.1    65
## 6   Mike   134   m  12      test 2 term.1    85

#Factorizing the columns (Sex and test.number) to zero and one by using dummy variable:

Newdata= fastDummies::dummy_cols(Data1_tidy, select_columns="Sex", remove_selected_columns = TRUE, remove_first_dummy = TRUE)

Newdata= fastDummies::dummy_cols(Newdata, select_columns="test.number", remove_selected_columns = TRUE, remove_first_dummy = TRUE)

Newdata= fastDummies::dummy_cols(Newdata, select_columns="Terms", remove_selected_columns = TRUE)

#Rename the two columns:
names(Newdata)[5]="Sex"
names(Newdata)[6]="Test"

#Change the phone column type to factor:

Newdata$phone = as.factor (Newdata$phone)

#Change the Age column type to numeric:

Newdata$Age = as.numeric (Newdata$Age)

#Display the Newdata after the changes:
head(Newdata)

##     name phone Age score Sex Test Terms_term.1 Terms_term.2 Terms_term.3
## 1   Mike   134  12    76   1    0            1            0            0
## 2  Linda   270  13    88   0    0            1            0            0
## 3    Sam   210  11    78   1    0            1            0            0
## 4 Esther   617  12    68   0    0            1            0            0
## 5   Mary   114  14    65   0    0            1            0            0
## 6   Mike   134  12    85   1    1            1            0            0

hist(Newdata$Age)

hist(Newdata$score)

Dataset 2:

This dataset shows the relationship between religion and income.

#Import the data from GitHub:

#Data2<- read.csv(file="https://github.com/GehadGad/Project2-data1/raw/master/Income.religion.csv", header = TRUE)


#Import the data from csv file:
Data2 <- read.csv ("Income.religion.csv")

#Display the data.

head(Data2)

##           religion X.10k X10.20k X20.30k X30.40k X40.50k X50.75k X75.100k
## 1         Agnostic    27      34      60      81      76     137      122
## 2          Atheist    12      27      37      52      35      70       73
## 3         Buddhist    27      21      30      34      33      58       62
## 4         Catholic   418     617     732     670     638    1116      949
## 5          refused    15      14      15      11      10      35       21
## 6 Evangelical Prot   575     869    1064     982     881    1486      949
##   X100.150k X.150k refused
## 1       109     84      96
## 2        59     74      76
## 3        39     53      54
## 4       792    633    1489
## 5        17     18     116
## 6       723    414    1529

#Rename the columns names:

names(Data2)= c("Religion", "lessThan10k","From10To20k", "From20To30k", "From30To40k", "From40To50k","From50To75k", "From75To100k", "From100To150k", "greaterThan150k", "Refused")

#Display the data:
head(Data2)

##           Religion lessThan10k From10To20k From20To30k From30To40k
## 1         Agnostic          27          34          60          81
## 2          Atheist          12          27          37          52
## 3         Buddhist          27          21          30          34
## 4         Catholic         418         617         732         670
## 5          refused          15          14          15          11
## 6 Evangelical Prot         575         869        1064         982
##   From40To50k From50To75k From75To100k From100To150k greaterThan150k
## 1          76         137          122           109              84
## 2          35          70           73            59              74
## 3          33          58           62            39              53
## 4         638        1116          949           792             633
## 5          10          35           21            17              18
## 6         881        1486          949           723             414
##   Refused
## 1      96
## 2      76
## 3      54
## 4    1489
## 5     116
## 6    1529

summary (Data2)

##              Religion   lessThan10k      From10To20k      From20To30k    
##  Agnostic        : 1   Min.   :  1.00   Min.   :  2.00   Min.   :   3.0  
##  Atheist         : 1   1st Qu.: 12.25   1st Qu.: 14.75   1st Qu.:  17.0  
##  Buddhist        : 1   Median : 20.00   Median : 27.00   Median :  33.5  
##  Catholic        : 1   Mean   :107.22   Mean   :154.50   Mean   : 186.5  
##  Evangelical Prot: 1   3rd Qu.:170.00   3rd Qu.:193.00   3rd Qu.: 192.0  
##  Hindu           : 1   Max.   :575.00   Max.   :869.00   Max.   :1064.0  
##  (Other)         :12                                                     
##   From30To40k      From40To50k     From50To75k       From75To100k   
##  Min.   :  4.00   Min.   :  2.0   Min.   :   7.00   Min.   :  3.00  
##  1st Qu.: 15.75   1st Qu.: 15.0   1st Qu.:  34.25   1st Qu.: 25.25  
##  Median : 40.00   Median : 34.0   Median :  66.50   Median : 65.50  
##  Mean   :183.44   Mean   :171.4   Mean   : 288.06   Mean   :221.67  
##  3rd Qu.:198.75   3rd Qu.:166.8   3rd Qu.: 201.50   3rd Qu.:128.75  
##  Max.   :982.00   Max.   :881.0   Max.   :1486.00   Max.   :949.00  
##                                                                     
##  From100To150k   greaterThan150k     Refused       
##  Min.   :  4.0   Min.   :  4.00   Min.   :   8.00  
##  1st Qu.: 22.5   1st Qu.: 23.75   1st Qu.:  41.25  
##  Median : 48.5   Median : 53.50   Median :  74.50  
##  Mean   :177.6   Mean   :144.89   Mean   : 340.06  
##  3rd Qu.:103.5   3rd Qu.:134.25   3rd Qu.: 294.75  
##  Max.   :792.0   Max.   :634.00   Max.   :1529.00  
##

The summary displays the minimum, mean, median and maximum for each column. It also display the 1st and 3rd quarter.

#Create a new dataframe called Data1_gather to collect and tide all the salaries in one column.

Data2_tidy <- gather(Data2, Income, Frequency, 2:11)

#Display the data.
head(Data2_tidy)

##           Religion      Income Frequency
## 1         Agnostic lessThan10k        27
## 2          Atheist lessThan10k        12
## 3         Buddhist lessThan10k        27
## 4         Catholic lessThan10k       418
## 5          refused lessThan10k        15
## 6 Evangelical Prot lessThan10k       575

for (i in names(Data2)) {
xlim(0,15)
p = ggplot(Data2, aes_string(x='Religion', y= i))+geom_bar(stat = "identity")
show(p)

}

#The graphs below shows each column for all the religion:

Dataset 3:

Climate New York-La Guardia Arpt - New York

#Import the data from GitHub:

#Data3 <- read.csv (file="https://github.com/GehadGad/Project2-data2/raw/master/Climate%20NY.csv")

#Import the data from csv file:
Data3 <- read.csv ("ClimateNY.csv")


#Display the data.

head(Data3)

##                     Climate   Jan   Feb   Mar Apr   May   Jun  Jul   Aug
## 1         Average high in F 39.00 42.00 50.00  61 71.00 80.00 85.0 84.00
## 2          Average low in F 27.00 29.00 35.00  44 54.00 64.00 69.0 69.00
## 3 Av. Precipitation in inch  3.17  2.76  3.97   4  3.79  3.94  4.5  4.12
## 4      Av. Snowfall in inch  7.00  9.00  4.00   1  0.00  0.00  0.0  0.00
##     Sep   Oct   Nov   Dec
## 1 76.00 65.00 55.00 44.00
## 2 62.00 51.00 42.00 32.00
## 3  3.73  3.78  3.41  3.56
## 4  0.00  0.00  0.00  5.00

#Change the table shape (transpose). This is because the months seems to be observation, not a feature.

Data3_tidy=data.frame(t(Data3))

#Rename the header
names(Data3_tidy)= c ("AverageHighInF","AverageLowInF", "AvPrecipitationInInch", "AvSnowfallInInch")

#Delete a row
Data3_tidy= Data3_tidy[2:13,]

#Name the index to MONTH:
Data3_tidy= tibble::rownames_to_column(Data3_tidy, "Month")
  
#Display the date:
head (Data3_tidy)

##   Month AverageHighInF AverageLowInF AvPrecipitationInInch
## 1   Jan          39.00         27.00                  3.17
## 2   Feb          42.00         29.00                  2.76
## 3   Mar          50.00         35.00                  3.97
## 4   Apr             61            44                     4
## 5   May          71.00         54.00                  3.79
## 6   Jun          80.00         64.00                  3.94
##   AvSnowfallInInch
## 1             7.00
## 2             9.00
## 3             4.00
## 4                1
## 5             0.00
## 6             0.00

#Change the columns to be numeric

Data3_tidy$AverageHighInF<- as.numeric (as.character (Data3_tidy$AverageHighInF))
Data3_tidy$AverageLowInF <- as.numeric(as.character (Data3_tidy$AverageLowInF))
Data3_tidy$AvPrecipitationInInch <- as.numeric(as.character (Data3_tidy$AvPrecipitationInInch))
Data3_tidy$AvSnowfallInInch <- as.numeric(as.character (Data3_tidy$AvSnowfallInInch))

summary (Data3_tidy)

##     Month           AverageHighInF  AverageLowInF   AvPrecipitationInInch
##  Length:12          Min.   :39.00   Min.   :27.00   Min.   :2.760        
##  Class :character   1st Qu.:48.50   1st Qu.:34.25   1st Qu.:3.522        
##  Mode  :character   Median :63.00   Median :47.50   Median :3.785        
##                     Mean   :62.67   Mean   :48.17   Mean   :3.728        
##                     3rd Qu.:77.00   3rd Qu.:62.50   3rd Qu.:3.978        
##                     Max.   :85.00   Max.   :69.00   Max.   :4.500        
##  AvSnowfallInInch
##  Min.   :0.000   
##  1st Qu.:0.000   
##  Median :0.000   
##  Mean   :2.167   
##  3rd Qu.:4.250   
##  Max.   :9.000

#Find the correlation between AverageHighInF and AverageLowInF.

cor(Data3_tidy$AverageHighInF,Data3_tidy$AverageLowInF)

## [1] 0.9964071

The Correlation shows how strongly the variables are related. The correlation ranges from -1.0 to +1.0. The closer the correlation (r) to +1 or -1, the more closely the two variables are related. The cor between (AverageHighInF) and (AverageLowInF) is very high and close to 1.

#Find the correlation between AverageLowInF and AvPrecipitationInInch.
cor(Data3_tidy$AvPrecipitationInInch,Data3_tidy$AverageLowInF)

## [1] 0.7510468

The cor between (AvPrecipitationInInch) and (AverageLowInF) is good enough and close to 1.

#Graphs

for (i in names(Data3_tidy)){
show(ggplot(data = Data3_tidy, aes_string(x = "Month", y = i))+ geom_bar(stat="identity", position="dodge") + ggtitle("Climate change by Month") + ylab(i))

  
}