# Load the neuro data set (comma-separated, first line holds column names).
qData <- read.csv(
  "http://vincentarelbundock.github.io/Rdatasets/csv/boot/neuro.csv",
  header = TRUE,
  sep = ","
)
# Preview the first rows to confirm the import worked.
head(qData)
## X V1 V2 V3 V4 V5 V6
## 1 1 NA -203.7 -84.1 18.5 NA NA
## 2 2 NA -203.0 -97.8 25.8 134.7 NA
## 3 3 NA -249.0 -92.1 27.8 177.1 NA
## 4 4 NA -231.5 -97.5 27.0 150.3 NA
## 5 5 NA NA -130.1 25.8 160.0 NA
## 6 6 NA -223.1 -70.7 62.1 197.5 NA
1. Use the summary function to gain an overview of the data set. Then display the mean and median for at least two attributes.
# Per-column overview: quartiles, mean, and the number of NA entries.
summary(qData)
## X V1 V2 V3
## Min. : 1 Min. :-249.9 Min. :-249.0 Min. :-235.80
## 1st Qu.:118 1st Qu.:-241.6 1st Qu.:-216.6 1st Qu.: -97.10
## Median :235 Median :-234.9 Median :-190.6 Median : -67.90
## Mean :235 Mean :-232.7 Mean :-186.6 Mean : -66.16
## 3rd Qu.:352 3rd Qu.:-224.0 3rd Qu.:-156.2 3rd Qu.: -32.00
## Max. :469 Max. :-196.9 Max. : -98.6 Max. : -0.10
## NA's :429 NA's :51
## V4 V5 V6
## Min. : 1.00 Min. : 75.6 Min. :154.3
## 1st Qu.: 26.27 1st Qu.:134.2 1st Qu.:222.1
## Median : 29.70 Median :154.9 Median :235.7
## Mean : 48.07 Mean :162.5 Mean :229.7
## 3rd Qu.: 72.20 3rd Qu.:193.3 3rd Qu.:243.5
## Max. :197.80 Max. :249.3 Max. :249.7
## NA's :1 NA's :24 NA's :379
# Smooth data: replace the NA entries of each column with that column's mean.
# seq_along() is used instead of 1:ncol() so that a data frame without any
# columns is skipped instead of being indexed with c(1, 0).
for (i in seq_along(qData)) {
  qData[is.na(qData[, i]), i] <- mean(qData[, i], na.rm = TRUE)
}
# Check the data
head(qData)
## X V1 V2 V3 V4 V5 V6
## 1 1 -232.72 -203.700 -84.1 18.5 162.4829 229.74
## 2 2 -232.72 -203.000 -97.8 25.8 134.7000 229.74
## 3 3 -232.72 -249.000 -92.1 27.8 177.1000 229.74
## 4 4 -232.72 -231.500 -97.5 27.0 150.3000 229.74
## 5 5 -232.72 -186.567 -130.1 25.8 160.0000 229.74
## 6 6 -232.72 -223.100 -70.7 62.1 197.5000 229.74
# Mean of V1 on the mean-smoothed data (NA entries were filled in above).
qData.mean.V1 <- mean(qData$V1)
print(qData.mean.V1)
## [1] -232.72
# Mean of V2 on the mean-smoothed data.
qData.mean.V2 <- mean(qData$V2)
print(qData.mean.V2)
## [1] -186.567
# Get fresh data to smooth with the median. We could also keep a copy of the
# original data frame for this calculation instead of downloading it again.
qData <- read.csv(file = "http://vincentarelbundock.github.io/Rdatasets/csv/boot/neuro.csv", header = TRUE, sep = ",")
# Replace the NA entries of each column with that column's median.
# seq_along() is used instead of 1:ncol() so that a data frame without any
# columns is skipped instead of being indexed with c(1, 0).
for (i in seq_along(qData)) {
  qData[is.na(qData[, i]), i] <- median(qData[, i], na.rm = TRUE)
}
# Check the data
head(qData)
## X V1 V2 V3 V4 V5 V6
## 1 1 -234.95 -203.70 -84.1 18.5 154.9 235.7
## 2 2 -234.95 -203.00 -97.8 25.8 134.7 235.7
## 3 3 -234.95 -249.00 -92.1 27.8 177.1 235.7
## 4 4 -234.95 -231.50 -97.5 27.0 150.3 235.7
## 5 5 -234.95 -190.55 -130.1 25.8 160.0 235.7
## 6 6 -234.95 -223.10 -70.7 62.1 197.5 235.7
# Median of V1 on the median-smoothed data.
qData.median.V1 <- median(qData$V1)
print(qData.median.V1)
## [1] -234.95
# Check data quality for V1. The y-axis of a box plot shows the data values,
# not frequencies, so it is labelled accordingly.
boxplot(qData$V1, ylab = "Value", main = "V1")

# Median of V2 on the median-smoothed data.
qData.median.V2 <- median(qData$V2)
print(qData.median.V2)
## [1] -190.55
# Check data quality for V2 (y-axis shows the data values).
boxplot(qData$V2, ylab = "Value", main = "V2")

2. Create a new data frame with a subset of the columns and rows. Make sure to rename it
# Build the subset: rows 5 through 21, columns at positions 2, 3, 6 and 7.
qDataNew <- qData[5:21, c(2:3, 6:7)]
# Give the subset its own column names.
colnames(qDataNew) <- c("VN1", "VN2", "VN5", "VN6")
# Preview the subset.
head(qDataNew)
## VN1 VN2 VN5 VN6
## 5 -234.95 -190.55 160.0 235.7
## 6 -234.95 -223.10 197.5 235.7
## 7 -234.95 -164.80 202.8 235.7
## 8 -234.95 -221.60 144.5 235.7
## 9 -234.95 -153.70 222.4 235.7
## 10 -234.95 -184.70 208.9 235.7
3. Create new column names for the new data frame.
# Replace the subset's column names with the "VNew*" labels.
names(qDataNew) <- c("VNew1", "VNew2", "VNew5", "VNew6")
# Preview the renamed subset.
head(qDataNew)
## VNew1 VNew2 VNew5 VNew6
## 5 -234.95 -190.55 160.0 235.7
## 6 -234.95 -223.10 197.5 235.7
## 7 -234.95 -164.80 202.8 235.7
## 8 -234.95 -221.60 144.5 235.7
## 9 -234.95 -153.70 222.4 235.7
## 10 -234.95 -184.70 208.9 235.7
Compare old and new data
# Mean and median of the subset columns. These are computed here because the
# comparison table below needs them and they are not defined earlier.
qDataNew.mean.VNew1 <- mean(qDataNew$VNew1)
qDataNew.median.VNew1 <- median(qDataNew$VNew1)
qDataNew.mean.VNew2 <- mean(qDataNew$VNew2)
qDataNew.median.VNew2 <- median(qDataNew$VNew2)
# Comparison table: one row per column and data set ("Old" = full data,
# "New" = subset), built in a single call instead of repeated rbind().
qDataStats <- data.frame(
  Mean = c(
    qData.mean.V1, qDataNew.mean.VNew1,
    qData.mean.V2, qDataNew.mean.VNew2
  ),
  Median = c(
    qData.median.V1, qDataNew.median.VNew1,
    qData.median.V2, qDataNew.median.VNew2
  ),
  row.names = c("OldV1", "NewV1", "OldV2", "NewV2")
)
print(qDataStats)
## Mean Median
## OldV1 -232.7200 -234.95
## NewV1 -234.8706 -234.95
## OldV2 -186.5670 -190.55
## NewV2 -196.0441 -195.70
Compare: The new V1 mean and median are essentially the same as the old ones (the median is identical). This could be because of the data smoothing and the size of the subset. This shows that a large data set gives more accurate results than a smaller one. The new V2 is different from the old V2.
5. For at least 3 values in a column please rename so that every value in that column is renamed. For example, suppose I have 20 values of the letter “e” in one column. Rename those values so that all 20 would show as “excellent”.
# Add a text column for this question; none of the existing columns hold text.
qData[["Rating"]] <- "No Data"
head(qData)
## X V1 V2 V3 V4 V5 V6 Rating
## 1 1 -234.95 -203.70 -84.1 18.5 154.9 235.7 No Data
## 2 2 -234.95 -203.00 -97.8 25.8 134.7 235.7 No Data
## 3 3 -234.95 -249.00 -92.1 27.8 177.1 235.7 No Data
## 4 4 -234.95 -231.50 -97.5 27.0 150.3 235.7 No Data
## 5 5 -234.95 -190.55 -130.1 25.8 160.0 235.7 No Data
## 6 6 -234.95 -223.10 -70.7 62.1 197.5 235.7 No Data
# Rename every "No Data" value to "excellent". Every row holds "No Data" at
# this point, so the whole column is renamed.
qData$Rating[qData$Rating == "No Data"] <- "excellent"
head(qData)
## X V1 V2 V3 V4 V5 V6 Rating
## 1 1 -234.95 -203.70 -84.1 18.5 154.9 235.7 excellent
## 2 2 -234.95 -203.00 -97.8 25.8 134.7 235.7 excellent
## 3 3 -234.95 -249.00 -92.1 27.8 177.1 235.7 excellent
## 4 4 -234.95 -231.50 -97.5 27.0 150.3 235.7 excellent
## 5 5 -234.95 -190.55 -130.1 25.8 160.0 235.7 excellent
## 6 6 -234.95 -223.10 -70.7 62.1 197.5 235.7 excellent
6. Display enough rows to see examples of all of steps 1 - 5 above.
# Show the first 30 rows. head() stops at the last available row, whereas
# indexing with 1:30 would add all-NA rows if the data had fewer than 30 rows.
head(qData, n = 30)
## X V1 V2 V3 V4 V5 V6 Rating
## 1 1 -234.95 -203.70 -84.1 18.5 154.9 235.7 excellent
## 2 2 -234.95 -203.00 -97.8 25.8 134.7 235.7 excellent
## 3 3 -234.95 -249.00 -92.1 27.8 177.1 235.7 excellent
## 4 4 -234.95 -231.50 -97.5 27.0 150.3 235.7 excellent
## 5 5 -234.95 -190.55 -130.1 25.8 160.0 235.7 excellent
## 6 6 -234.95 -223.10 -70.7 62.1 197.5 235.7 excellent
## 7 7 -234.95 -164.80 -12.2 76.8 202.8 235.7 excellent
## 8 8 -234.95 -221.60 -81.9 27.5 144.5 235.7 excellent
## 9 9 -234.95 -153.70 -17.0 76.1 222.4 235.7 excellent
## 10 10 -234.95 -184.70 -47.3 74.4 208.9 235.7 excellent
## 11 11 -234.95 -190.55 -148.8 11.4 137.7 235.7 excellent
## 12 12 -234.95 -197.60 -6.4 137.1 154.9 235.7 excellent
## 13 13 -234.95 -247.80 -35.4 80.9 229.5 235.7 excellent
## 14 14 -234.95 -227.00 -104.7 20.2 140.2 235.7 excellent
## 15 15 -233.60 -115.90 -10.5 70.0 202.6 235.7 excellent
## 16 16 -234.95 -232.40 -100.6 16.8 145.1 235.7 excellent
## 17 17 -234.95 -199.40 -58.2 29.1 184.4 235.7 excellent
## 18 18 -234.95 -195.70 -89.5 26.4 142.7 235.7 excellent
## 19 19 -234.95 -180.10 -65.0 27.3 171.1 235.7 excellent
## 20 20 -234.95 -190.55 -85.2 27.1 154.9 235.7 excellent
## 21 21 -234.95 -217.30 -77.1 27.6 151.5 235.7 excellent
## 22 22 -234.95 -139.70 -15.8 83.0 215.5 235.7 excellent
## 23 23 -249.60 -132.80 -14.1 78.1 205.7 235.7 excellent
## 24 24 -234.95 -152.70 -36.9 29.7 149.8 235.7 excellent
## 25 25 -234.95 -224.10 -81.9 29.1 172.2 235.7 excellent
## 26 26 -234.95 -190.55 -235.8 6.0 144.4 235.7 excellent
## 27 27 -234.95 -202.80 -45.1 84.0 227.3 235.7 excellent
## 28 28 -240.90 -138.40 -21.5 73.4 210.6 235.7 excellent
## 29 29 -247.10 -128.20 -31.3 29.2 143.1 235.7 excellent
## 30 30 -234.95 -185.40 -80.3 23.9 115.8 222.7 excellent
7. BONUS - place the original .csv in a github file and have R read from the link. This will be a very useful skill as you progress in your data science education and career.
Solution:
Create a git project. Go to the project directory
Commands:
echo "# CUNY_R" >> README.md
git init
git add README.md
git add neuro.csv
git status
git commit -m "r data for bridge HW2"
git push -u origin master
# RCurl provides getURL(). library() stops with an error if the package is
# missing, whereas require() only returns FALSE and lets the script run on.
library(RCurl)
## Warning: package 'RCurl' was built under R version 3.4.4
## Loading required package: bitops
## Warning: package 'bitops' was built under R version 3.4.4
# Read data from git (use the raw URL to avoid auth errors). getURL() fetches
# the file contents as text, which read.csv() then parses.
qGitData <- read.csv(
  text = getURL("https://raw.githubusercontent.com/monuchacko/CUNY_R/master/neuro.csv"),
  header = TRUE,
  sep = ","
)
# Check the data
head(qGitData)
## X V1 V2 V3 V4 V5 V6
## 1 1 NA -203.7 -84.1 18.5 NA NA
## 2 2 NA -203.0 -97.8 25.8 134.7 NA
## 3 3 NA -249.0 -92.1 27.8 177.1 NA
## 4 4 NA -231.5 -97.5 27.0 150.3 NA
## 5 5 NA NA -130.1 25.8 160.0 NA
## 6 6 NA -223.1 -70.7 62.1 197.5 NA