Gehad Gad
March 8th, 2020
DATA 607 Project 2
In this project, we are asked to choose any three of the “wide” datasets identified in the Week 5 Discussion items. For each of the three chosen datasets:
1.Create a .CSV file (or optionally, a MySQL database!) that includes all of the information included in the dataset. You’re encouraged to use a “wide” structure similar to how the information appears in the discussion item, so that you can practice tidying and transformations as described below.
2.Read the information from your .CSV file into R, and use tidyr and dplyr as needed to tidy and transform your data. [Most of your grade will be based on this step!]
3.Perform the analysis requested in the discussion item.
#Import libraries and/or Packages
library(ggplot2)
library (tidyr)
## Warning: package 'tidyr' was built under R version 3.6.2
library (dplyr)
## Warning: package 'dplyr' was built under R version 3.6.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(funModeling)
## Warning: package 'funModeling' was built under R version 3.6.3
## Loading required package: Hmisc
## Warning: package 'Hmisc' was built under R version 3.6.3
## Loading required package: lattice
## Loading required package: survival
## Warning: package 'survival' was built under R version 3.6.3
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
## funModeling v.1.9.3 :)
## Examples and tutorials at livebook.datascienceheroes.com
## / Now in Spanish: librovivodecienciadedatos.ai
library(fastDummies)
Dataset 1:
This dataset is include information about student performance during three terms for two tests.
#Import the data from GitHub:
#Data1 <- read.csv(file="https://github.com/GehadGad/Project2-data3/raw/master/Studentlist.csv")
#Import the data from csv file:
Data1 <- read.csv ("Studentlist.csv")
#Display the data.
head(Data1)
## id name phone sex.and.age test.number term.1 term.2 term.3
## 1 1 Mike 134 m_12 test 1 76 84 87
## 2 2 Linda 270 f_13 test 1 88 90 73
## 3 3 Sam 210 m_11 test 1 78 74 80
## 4 4 Esther 617 f_12 test 1 68 75 74
## 5 5 Mary 114 f_14 test 1 65 67 64
## 6 1 Mike 134 m_12 test 2 85 80 90
#Create a subset for the data and remove the (id) column.
Data1 = subset(Data1, select = -c(id) )
#Separate the column (sex.and.age) to two separate columns (sex) and (age)
Data1_tidy <-separate(Data1, sex.and.age, into = c("Sex", "Age"))
#Gather all the terms in one columns.
Data1_tidy <- gather(Data1_tidy, Terms, score, 6:8)
#Display the data.
head(Data1_tidy)
## name phone Sex Age test.number Terms score
## 1 Mike 134 m 12 test 1 term.1 76
## 2 Linda 270 f 13 test 1 term.1 88
## 3 Sam 210 m 11 test 1 term.1 78
## 4 Esther 617 f 12 test 1 term.1 68
## 5 Mary 114 f 14 test 1 term.1 65
## 6 Mike 134 m 12 test 2 term.1 85
#Factorizing the columns (Sex and test.number) to zero and one by using dummy variable:
Newdata= fastDummies::dummy_cols(Data1_tidy, select_columns="Sex", remove_selected_columns = TRUE, remove_first_dummy = TRUE)
Newdata= fastDummies::dummy_cols(Newdata, select_columns="test.number", remove_selected_columns = TRUE, remove_first_dummy = TRUE)
Newdata= fastDummies::dummy_cols(Newdata, select_columns="Terms", remove_selected_columns = TRUE)
#Rename the two columns:
names(Newdata)[5]="Sex"
names(Newdata)[6]="Test"
#Change the phone column type to factor:
Newdata$phone = as.factor (Newdata$phone)
#Change the Age column type to numeric:
Newdata$Age = as.numeric (Newdata$Age)
#Display the Newdata after the changes:
head(Newdata)
## name phone Age score Sex Test Terms_term.1 Terms_term.2 Terms_term.3
## 1 Mike 134 12 76 1 0 1 0 0
## 2 Linda 270 13 88 0 0 1 0 0
## 3 Sam 210 11 78 1 0 1 0 0
## 4 Esther 617 12 68 0 0 1 0 0
## 5 Mary 114 14 65 0 0 1 0 0
## 6 Mike 134 12 85 1 1 1 0 0
hist(Newdata$Age)
hist(Newdata$score)
Dataset 2:
This dataset shows the relationship between religion and income.
#Import the data from GitHub:
#Data2<- read.csv(file="https://github.com/GehadGad/Project2-data1/raw/master/Income.religion.csv", header = TRUE)
#Import the data from csv file:
Data2 <- read.csv ("Income.religion.csv")
#Display the data.
head(Data2)
## religion X.10k X10.20k X20.30k X30.40k X40.50k X50.75k X75.100k
## 1 Agnostic 27 34 60 81 76 137 122
## 2 Atheist 12 27 37 52 35 70 73
## 3 Buddhist 27 21 30 34 33 58 62
## 4 Catholic 418 617 732 670 638 1116 949
## 5 refused 15 14 15 11 10 35 21
## 6 Evangelical Prot 575 869 1064 982 881 1486 949
## X100.150k X.150k refused
## 1 109 84 96
## 2 59 74 76
## 3 39 53 54
## 4 792 633 1489
## 5 17 18 116
## 6 723 414 1529
#Rename the columns names:
names(Data2)= c("Religion", "lessThan10k","From10To20k", "From20To30k", "From30To40k", "From40To50k","From50To75k", "From75To100k", "From100To150k", "greaterThan150k", "Refused")
#Display the data:
head(Data2)
## Religion lessThan10k From10To20k From20To30k From30To40k
## 1 Agnostic 27 34 60 81
## 2 Atheist 12 27 37 52
## 3 Buddhist 27 21 30 34
## 4 Catholic 418 617 732 670
## 5 refused 15 14 15 11
## 6 Evangelical Prot 575 869 1064 982
## From40To50k From50To75k From75To100k From100To150k greaterThan150k
## 1 76 137 122 109 84
## 2 35 70 73 59 74
## 3 33 58 62 39 53
## 4 638 1116 949 792 633
## 5 10 35 21 17 18
## 6 881 1486 949 723 414
## Refused
## 1 96
## 2 76
## 3 54
## 4 1489
## 5 116
## 6 1529
summary (Data2)
## Religion lessThan10k From10To20k From20To30k
## Agnostic : 1 Min. : 1.00 Min. : 2.00 Min. : 3.0
## Atheist : 1 1st Qu.: 12.25 1st Qu.: 14.75 1st Qu.: 17.0
## Buddhist : 1 Median : 20.00 Median : 27.00 Median : 33.5
## Catholic : 1 Mean :107.22 Mean :154.50 Mean : 186.5
## Evangelical Prot: 1 3rd Qu.:170.00 3rd Qu.:193.00 3rd Qu.: 192.0
## Hindu : 1 Max. :575.00 Max. :869.00 Max. :1064.0
## (Other) :12
## From30To40k From40To50k From50To75k From75To100k
## Min. : 4.00 Min. : 2.0 Min. : 7.00 Min. : 3.00
## 1st Qu.: 15.75 1st Qu.: 15.0 1st Qu.: 34.25 1st Qu.: 25.25
## Median : 40.00 Median : 34.0 Median : 66.50 Median : 65.50
## Mean :183.44 Mean :171.4 Mean : 288.06 Mean :221.67
## 3rd Qu.:198.75 3rd Qu.:166.8 3rd Qu.: 201.50 3rd Qu.:128.75
## Max. :982.00 Max. :881.0 Max. :1486.00 Max. :949.00
##
## From100To150k greaterThan150k Refused
## Min. : 4.0 Min. : 4.00 Min. : 8.00
## 1st Qu.: 22.5 1st Qu.: 23.75 1st Qu.: 41.25
## Median : 48.5 Median : 53.50 Median : 74.50
## Mean :177.6 Mean :144.89 Mean : 340.06
## 3rd Qu.:103.5 3rd Qu.:134.25 3rd Qu.: 294.75
## Max. :792.0 Max. :634.00 Max. :1529.00
##
The summary displays the minimum, mean, median and maximum for each column. It also display the 1st and 3rd quarter.
#Create a new dataframe called Data1_gather to collect and tide all the salaries in one column.
Data2_tidy <- gather(Data2, Income, Frequency, 2:11)
#Display the data.
head(Data2_tidy)
## Religion Income Frequency
## 1 Agnostic lessThan10k 27
## 2 Atheist lessThan10k 12
## 3 Buddhist lessThan10k 27
## 4 Catholic lessThan10k 418
## 5 refused lessThan10k 15
## 6 Evangelical Prot lessThan10k 575
for (i in names(Data2)) {
xlim(0,15)
p = ggplot(Data2, aes_string(x='Religion', y= i))+geom_bar(stat = "identity")
show(p)
}
#The graphs below shows each column for all the religion:
Dataset 3:
Climate New York-La Guardia Arpt - New York
#Import the data from GitHub:
#Data3 <- read.csv (file="https://github.com/GehadGad/Project2-data2/raw/master/Climate%20NY.csv")
#Import the data from csv file:
Data3 <- read.csv ("ClimateNY.csv")
#Display the data.
head(Data3)
## Climate Jan Feb Mar Apr May Jun Jul Aug
## 1 Average high in F 39.00 42.00 50.00 61 71.00 80.00 85.0 84.00
## 2 Average low in F 27.00 29.00 35.00 44 54.00 64.00 69.0 69.00
## 3 Av. Precipitation in inch 3.17 2.76 3.97 4 3.79 3.94 4.5 4.12
## 4 Av. Snowfall in inch 7.00 9.00 4.00 1 0.00 0.00 0.0 0.00
## Sep Oct Nov Dec
## 1 76.00 65.00 55.00 44.00
## 2 62.00 51.00 42.00 32.00
## 3 3.73 3.78 3.41 3.56
## 4 0.00 0.00 0.00 5.00
#Change the table shape (transpose). This is because the months seems to be observation, not a feature.
Data3_tidy=data.frame(t(Data3))
#Rename the header
names(Data3_tidy)= c ("AverageHighInF","AverageLowInF", "AvPrecipitationInInch", "AvSnowfallInInch")
#Delete a row
Data3_tidy= Data3_tidy[2:13,]
#Name the index to MONTH:
Data3_tidy= tibble::rownames_to_column(Data3_tidy, "Month")
#Display the date:
head (Data3_tidy)
## Month AverageHighInF AverageLowInF AvPrecipitationInInch
## 1 Jan 39.00 27.00 3.17
## 2 Feb 42.00 29.00 2.76
## 3 Mar 50.00 35.00 3.97
## 4 Apr 61 44 4
## 5 May 71.00 54.00 3.79
## 6 Jun 80.00 64.00 3.94
## AvSnowfallInInch
## 1 7.00
## 2 9.00
## 3 4.00
## 4 1
## 5 0.00
## 6 0.00
#Change the columns to be numeric
Data3_tidy$AverageHighInF<- as.numeric (as.character (Data3_tidy$AverageHighInF))
Data3_tidy$AverageLowInF <- as.numeric(as.character (Data3_tidy$AverageLowInF))
Data3_tidy$AvPrecipitationInInch <- as.numeric(as.character (Data3_tidy$AvPrecipitationInInch))
Data3_tidy$AvSnowfallInInch <- as.numeric(as.character (Data3_tidy$AvSnowfallInInch))
summary (Data3_tidy)
## Month AverageHighInF AverageLowInF AvPrecipitationInInch
## Length:12 Min. :39.00 Min. :27.00 Min. :2.760
## Class :character 1st Qu.:48.50 1st Qu.:34.25 1st Qu.:3.522
## Mode :character Median :63.00 Median :47.50 Median :3.785
## Mean :62.67 Mean :48.17 Mean :3.728
## 3rd Qu.:77.00 3rd Qu.:62.50 3rd Qu.:3.978
## Max. :85.00 Max. :69.00 Max. :4.500
## AvSnowfallInInch
## Min. :0.000
## 1st Qu.:0.000
## Median :0.000
## Mean :2.167
## 3rd Qu.:4.250
## Max. :9.000
#Find the correlation between AverageHighInF and AverageLowInF.
cor(Data3_tidy$AverageHighInF,Data3_tidy$AverageLowInF)
## [1] 0.9964071
The Correlation shows how strongly the variables are related. The correlation ranges from -1.0 to +1.0. The closer the correlation (r) to +1 or -1, the more closely the two variables are related. The cor between (AverageHighInF) and (AverageLowInF) is very high and close to 1.
#Find the correlation between AverageLowInF and AvPrecipitationInInch.
cor(Data3_tidy$AvPrecipitationInInch,Data3_tidy$AverageLowInF)
## [1] 0.7510468
The cor between (AvPrecipitationInInch) and (AverageLowInF) is good enough and close to 1.
#Graphs
for (i in names(Data3_tidy)){
show(ggplot(data = Data3_tidy, aes_string(x = "Month", y = i))+ geom_bar(stat="identity", position="dodge") + ggtitle("Climate change by Month") + ylab(i))
}