#1. Read the Election08 CSV from GitHub into a plain data.frame.
# fread is called through its namespace so this step does not rely on
# data.table having been attached beforehand (the same pkg::fun form that
# step 4 uses for plyr::rename). data.frame() then turns fread's result into
# a base data.frame.
mydata <- data.frame(
  data.table::fread(
    "https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/Stat2Data/Election08.csv"
  )
)
#2. Use the summary function to gain an overview of the data set.
# The result is not assigned; its printed form is the `##` transcript that follows.
summary(mydata)
## V1 State Abr Income
## Length:51 Length:51 Length:51 Min. :28845
## Class :character Class :character Class :character 1st Qu.:33536
## Mode :character Mode :character Mode :character Median :36047
## Mean :37642
## 3rd Qu.:40544
## Max. :61092
## HS BA Dem.Rep ObamaWin
## Min. :78.50 Min. :17.30 Min. :-23.00 Min. :0.0000
## 1st Qu.:83.00 1st Qu.:24.20 1st Qu.: 3.00 1st Qu.:0.0000
## Median :87.00 Median :25.80 Median : 12.00 Median :1.0000
## Mean :86.00 Mean :27.15 Mean : 12.31 Mean :0.5686
## 3rd Qu.:89.05 3rd Qu.:29.65 3rd Qu.: 19.00 3rd Qu.:1.0000
## Max. :91.20 Max. :47.50 Max. : 75.00 Max. :1.0000
# Next, report a mean and a median for two attributes (Income and HS).
# meanandMedian() is defined outside this section; going by the echoed result
# it prints the mean of its first argument and the median of its second.
meanandMedian(mydata[["Income"]], mydata[["HS"]])
## [1] 37641.57
## [1] 87
#3. Build a new, separately named data frame from a subset of the rows:
# states with Income above 40000 where ObamaWin is 1. All columns are kept.
newData <- subset(mydata, Income > 40000 & ObamaWin == 1)
# Summarize the filtered data frame (per-column statistics are echoed below);
# the rows themselves are printed by the print(newData) call further down.
summary(newData)
## V1 State Abr Income
## Length:15 Length:15 Length:15 Min. :40322
## Class :character Class :character Class :character 1st Qu.:40821
## Mode :character Mode :character Mode :character Median :41512
## Mean :45015
## 3rd Qu.:48234
## Max. :61092
## HS BA Dem.Rep ObamaWin
## Min. :80.20 Min. :21.80 Min. : 9.00 Min. :1
## 1st Qu.:85.70 1st Qu.:29.90 1st Qu.:14.00 1st Qu.:1
## Median :87.40 Median :32.50 Median :19.00 Median :1
## Mean :86.88 Mean :32.68 Mean :23.27 Mean :1
## 3rd Qu.:88.65 3rd Qu.:34.85 3rd Qu.:26.00 3rd Qu.:1
## Max. :91.00 Max. :47.50 Max. :75.00 Max. :1
# Show every row of the filtered data frame; row labels are those of mydata.
print(newData)
## V1 State Abr Income HS BA Dem.Rep ObamaWin
## 5 5 California CA 41571 80.2 29.5 19 1
## 6 6 Colorado CO 41042 88.9 35.0 11 1
## 7 7 Connecticut CT 54117 88.0 34.7 26 1
## 8 8 Delaware DE 40608 87.4 26.1 23 1
## 9 9 District of Columbia DC 61092 85.7 47.5 75 1
## 14 14 Illinois IL 40322 85.7 29.5 24 1
## 21 21 Maryland MD 46021 87.4 35.2 26 1
## 22 22 Massachusetts MA 49082 88.4 37.9 34 1
## 24 24 Minnesota MN 41034 91.0 31.0 15 1
## 29 29 Nevada NV 40480 83.7 21.8 11 1
## 30 30 New Hampshire NH 41512 90.5 32.5 13 1
## 31 31 New Jersey NJ 49194 87.0 33.9 19 1
## 33 33 New York NY 47385 84.1 31.7 27 1
## 47 47 Virginia VA 41347 85.9 33.6 9 1
## 48 48 Washington WA 40414 89.3 30.3 17 1
#4. Give two columns of the filtered data frame new names.
# The mapping reads old name -> new name; every other column keeps its name.
modata <- plyr::rename(
  newData,
  c(Income = "fortyKIncome", ObamaWin = "BarackWin")
)
print(modata)
## V1 State Abr fortyKIncome HS BA Dem.Rep BarackWin
## 5 5 California CA 41571 80.2 29.5 19 1
## 6 6 Colorado CO 41042 88.9 35.0 11 1
## 7 7 Connecticut CT 54117 88.0 34.7 26 1
## 8 8 Delaware DE 40608 87.4 26.1 23 1
## 9 9 District of Columbia DC 61092 85.7 47.5 75 1
## 14 14 Illinois IL 40322 85.7 29.5 24 1
## 21 21 Maryland MD 46021 87.4 35.2 26 1
## 22 22 Massachusetts MA 49082 88.4 37.9 34 1
## 24 24 Minnesota MN 41034 91.0 31.0 15 1
## 29 29 Nevada NV 40480 83.7 21.8 11 1
## 30 30 New Hampshire NH 41512 90.5 32.5 13 1
## 31 31 New Jersey NJ 49194 87.0 33.9 19 1
## 33 33 New York NY 47385 84.1 31.7 27 1
## 47 47 Virginia VA 41347 85.9 33.6 9 1
## 48 48 Washington WA 40414 89.3 30.3 17 1
#5. Use the summary function to create an overview of the new data frame. Then print the mean
# and median for the same two attributes and compare them with the values from the full data set.
summary(modata)
## V1 State Abr fortyKIncome
## Length:15 Length:15 Length:15 Min. :40322
## Class :character Class :character Class :character 1st Qu.:40821
## Mode :character Mode :character Mode :character Median :41512
## Mean :45015
## 3rd Qu.:48234
## Max. :61092
## HS BA Dem.Rep BarackWin
## Min. :80.20 Min. :21.80 Min. : 9.00 Min. :1
## 1st Qu.:85.70 1st Qu.:29.90 1st Qu.:14.00 1st Qu.:1
## Median :87.40 Median :32.50 Median :19.00 Median :1
## Mean :86.88 Mean :32.68 Mean :23.27 Mean :1
## 3rd Qu.:88.65 3rd Qu.:34.85 3rd Qu.:26.00 3rd Qu.:1
## Max. :91.00 Max. :47.50 Max. :75.00 Max. :1
# Same two attributes as before, now taken from the filtered, renamed data frame.
meanandMedian(modata[["fortyKIncome"]], modata[["HS"]])
## [1] 45014.73
## [1] 87.4
#6. Replace every "e" (either case) in the text columns with "Excellent".
# gsub is applied to character/factor columns only. Running it over every
# column coerces numeric values to strings first, so a number that
# as.character() renders in scientific notation (e.g. 1e+05) would be
# rewritten, and the matrix round trip through sapply() also discards the
# original row names and numeric column types.
resultData <- modata
text_cols <- vapply(
  resultData,
  function(col) is.character(col) || is.factor(col),
  logical(1)
)
# Guard: assigning an empty list to zero selected columns is not needed.
if (any(text_cols)) {
  resultData[text_cols] <- lapply(
    resultData[text_cols],
    gsub,
    pattern = "e",
    replacement = "Excellent",
    ignore.case = TRUE
  )
}
# Print the modified data. NOTE: the `##` transcript below was captured with
# the earlier all-character version, so its row labels (1..15) and numeric
# formatting differ from what this code prints.
print(resultData)
## V1 State Abr fortyKIncome HS
## 1 5 California CA 41571 80.2
## 2 6 Colorado CO 41042 88.9
## 3 7 ConnExcellentcticut CT 54117 88
## 4 8 DExcellentlawarExcellent DExcellent 40608 87.4
## 5 9 District of Columbia DC 61092 85.7
## 6 14 Illinois IL 40322 85.7
## 7 21 Maryland MD 46021 87.4
## 8 22 MassachusExcellenttts MA 49082 88.4
## 9 24 MinnExcellentsota MN 41034 91
## 10 29 NExcellentvada NV 40480 83.7
## 11 30 NExcellentw HampshirExcellent NH 41512 90.5
## 12 31 NExcellentw JExcellentrsExcellenty NJ 49194 87
## 13 33 NExcellentw York NY 47385 84.1
## 14 47 Virginia VA 41347 85.9
## 15 48 Washington WA 40414 89.3
## BA Dem.Rep BarackWin
## 1 29.5 19 1
## 2 35 11 1
## 3 34.7 26 1
## 4 26.1 23 1
## 5 47.5 75 1
## 6 29.5 24 1
## 7 35.2 26 1
## 8 37.9 34 1
## 9 31 15 1
## 10 21.8 11 1
## 11 32.5 13 1
## 12 33.9 19 1
## 13 31.7 27 1
## 14 33.6 9 1
## 15 30.3 17 1