I uploaded the original .csv file to a GitHub repository and read it from the raw link (https://raw.githubusercontent.com/sigmasigmaiota/bfi/master/bfi.csv).
# Load RCurl, which provides getURL(). library() is used instead of require()
# so that a missing package stops the script here with an error; require()
# would only return FALSE and let the failure surface later at getURL().
# The workspace is intentionally not wiped with rm(list = ls()): doing so
# would delete unrelated objects for anyone running this script interactively.
library(RCurl)
## Warning: package 'RCurl' was built under R version 3.4.4
## Loading required package: bitops
# Download the raw CSV text from GitHub and parse it into the data frame "bfi".
bfi_url <- "https://raw.githubusercontent.com/sigmasigmaiota/bfi/master/bfi.csv"
bfi <- read.csv(text = getURL(bfi_url))
# Preview the first rows to check the column headings.
head(bfi)
## X.1 X A1 A2 A3 A4 A5 C1 C2 C3 C4 C5 E1 E2 E3 E4 E5 N1 N2 N3 N4 N5 O1
## 1 1 61617 2 4 3 4 4 2 3 3 4 4 3 3 3 4 4 3 4 2 2 3 3
## 2 2 61618 2 4 5 2 5 5 4 4 3 4 1 1 6 4 3 3 3 3 5 5 4
## 3 3 61620 5 4 5 4 4 4 5 4 2 5 2 4 4 4 5 4 5 4 2 3 4
## 4 4 61621 4 4 6 5 5 4 4 3 5 5 5 3 4 4 4 2 5 2 4 1 3
## 5 5 61622 2 3 3 4 5 4 4 5 3 2 2 2 5 4 5 2 3 4 4 3 3
## 6 6 61623 6 6 5 6 5 6 6 6 1 3 2 1 6 5 6 3 5 2 2 3 4
## O2 O3 O4 O5 gender education age
## 1 6 3 4 3 1 NA 16
## 2 2 4 3 3 2 NA 18
## 3 2 5 5 2 2 NA 17
## 4 3 4 3 5 2 NA 17
## 5 3 4 3 3 1 NA 17
## 6 3 5 6 1 2 3 21
# Summarise every column (range, quartiles, mean and NA counts) to spot
# missing or implausible values before analysing the data.
summary(bfi)
## X.1 X A1 A2
## Min. : 1.0 Min. :61617 Min. :1.000 Min. :1.000
## 1st Qu.: 700.8 1st Qu.:63080 1st Qu.:1.000 1st Qu.:4.000
## Median :1400.5 Median :64575 Median :2.000 Median :5.000
## Mean :1400.5 Mean :64599 Mean :2.413 Mean :4.802
## 3rd Qu.:2100.2 3rd Qu.:66092 3rd Qu.:3.000 3rd Qu.:6.000
## Max. :2800.0 Max. :67560 Max. :6.000 Max. :6.000
## NA's :16 NA's :27
## A3 A4 A5 C1
## Min. :1.000 Min. :1.0 Min. :1.00 Min. :1.000
## 1st Qu.:4.000 1st Qu.:4.0 1st Qu.:4.00 1st Qu.:4.000
## Median :5.000 Median :5.0 Median :5.00 Median :5.000
## Mean :4.604 Mean :4.7 Mean :4.56 Mean :4.502
## 3rd Qu.:6.000 3rd Qu.:6.0 3rd Qu.:5.00 3rd Qu.:5.000
## Max. :6.000 Max. :6.0 Max. :6.00 Max. :6.000
## NA's :26 NA's :19 NA's :16 NA's :21
## C2 C3 C4 C5
## Min. :1.00 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:4.00 1st Qu.:4.000 1st Qu.:1.000 1st Qu.:2.000
## Median :5.00 Median :5.000 Median :2.000 Median :3.000
## Mean :4.37 Mean :4.304 Mean :2.553 Mean :3.297
## 3rd Qu.:5.00 3rd Qu.:5.000 3rd Qu.:4.000 3rd Qu.:5.000
## Max. :6.00 Max. :6.000 Max. :6.000 Max. :6.000
## NA's :24 NA's :20 NA's :26 NA's :16
## E1 E2 E3 E4
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:3.000 1st Qu.:4.000
## Median :3.000 Median :3.000 Median :4.000 Median :5.000
## Mean :2.974 Mean :3.142 Mean :4.001 Mean :4.422
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:5.000 3rd Qu.:6.000
## Max. :6.000 Max. :6.000 Max. :6.000 Max. :6.000
## NA's :23 NA's :16 NA's :25 NA's :9
## E5 N1 N2 N3
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:4.000 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.000
## Median :5.000 Median :3.000 Median :4.000 Median :3.000
## Mean :4.416 Mean :2.929 Mean :3.508 Mean :3.217
## 3rd Qu.:5.000 3rd Qu.:4.000 3rd Qu.:5.000 3rd Qu.:4.000
## Max. :6.000 Max. :6.000 Max. :6.000 Max. :6.000
## NA's :21 NA's :22 NA's :21 NA's :11
## N4 N5 O1 O2
## Min. :1.000 Min. :1.00 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:2.00 1st Qu.:4.000 1st Qu.:1.000
## Median :3.000 Median :3.00 Median :5.000 Median :2.000
## Mean :3.186 Mean :2.97 Mean :4.816 Mean :2.713
## 3rd Qu.:4.000 3rd Qu.:4.00 3rd Qu.:6.000 3rd Qu.:4.000
## Max. :6.000 Max. :6.00 Max. :6.000 Max. :6.000
## NA's :36 NA's :29 NA's :22
## O3 O4 O5 gender
## Min. :1.000 Min. :1.000 Min. :1.00 Min. :1.000
## 1st Qu.:4.000 1st Qu.:4.000 1st Qu.:1.00 1st Qu.:1.000
## Median :5.000 Median :5.000 Median :2.00 Median :2.000
## Mean :4.438 Mean :4.892 Mean :2.49 Mean :1.672
## 3rd Qu.:5.000 3rd Qu.:6.000 3rd Qu.:3.00 3rd Qu.:2.000
## Max. :6.000 Max. :6.000 Max. :6.00 Max. :2.000
## NA's :28 NA's :14 NA's :20
## education age
## Min. :1.00 Min. : 3.00
## 1st Qu.:3.00 1st Qu.:20.00
## Median :3.00 Median :26.00
## Mean :3.19 Mean :28.78
## 3rd Qu.:4.00 3rd Qu.:35.00
## Max. :5.00 Max. :86.00
## NA's :223
There is an error in the data. According to the codebook accompanying the dataset, the value labels for the education variable are as follows:
1 = HS
2 = finished HS
3 = some college
4 = college graduate
5 = graduate degree
Sorting the data by age exposes a 3-year-old with a graduate degree and an 11-year-old with some college.
# Use subset() to list respondents aged 15 or younger who have an education
# level recorded, keeping only the education and age columns. Rows where the
# condition is NA (education missing) are dropped by subset().
young_with_education <- bfi$age <= 15 & bfi$education >= 1
error <- subset(bfi, young_with_education, select = c(education, age))
error
## education age
## 487 1 15
## 1139 3 11
## 1663 1 15
## 2356 1 14
## 2481 1 14
## 2523 1 15
## 2537 5 3
I’ll reassign education for these two cases as missing and check the result.
# Set education to NA for the implausible cases: respondents aged 11 or
# younger who have an education level recorded. The comparison yields NA for
# rows whose education is already missing; which() keeps only the positions
# that are TRUE, so the assignment index never contains NA.
implausible_rows <- which(bfi$education >= 1 & bfi$age <= 11)
bfi$education[implausible_rows] <- NA
# Repeat the under-16 check to confirm the corrected rows no longer appear.
error2 <- subset(bfi, age <= 15 & education >= 1,
                 select = c(education, age))
error2
## education age
## 487 1 15
## 1663 1 15
## 2356 1 14
## 2481 1 14
## 2523 1 15
# Print the mean and median of age and education for the full data set.
# Age is complete, but education contains missing values.
cat("The mean age is", mean(bfi$age), "\n")
## The mean age is 28.78214
cat("The median age is", median(bfi$age), "\n")
## The median age is 26
# Education needs na.rm = TRUE; otherwise the missing values make both
# statistics NA.
cat("After omitting missing values, the mean education level is",
    mean(bfi$education, na.rm = TRUE), "\n")
## After omitting missing values, the mean education level is 3.189515
cat("After omitting missing values, the median education level is",
    median(bfi$education, na.rm = TRUE), "\n")
## After omitting missing values, the median education level is 3
# Keep respondents aged 20 to 30 (inclusive) and only the gender, education
# and age columns.
bfi.sample <- subset(bfi, age >= 20 & age <= 30,
                     select = c(gender, education, age))
The following loop appends “.bfi” to every column name in bfi.sample.
# Append ".bfi" to each column name of bfi.sample, one column at a time.
# seq_len(ncol(...)) is used rather than 1:ncol(...) so that the loop body is
# skipped for a zero-column data frame instead of iterating over c(1, 0).
for (i in seq_len(ncol(bfi.sample))) {
  names(bfi.sample)[i] <- paste0(names(bfi.sample)[i], ".bfi")
}
# Show the renamed columns.
colnames(bfi.sample)
## [1] "gender.bfi" "education.bfi" "age.bfi"
# Summarise the subsample to confirm the age filter (20 to 30) took effect
# and to see how many education values are missing.
summary(bfi.sample)
## gender.bfi education.bfi age.bfi
## Min. :1.000 Min. :1.000 Min. :20.00
## 1st Qu.:1.000 1st Qu.:3.000 1st Qu.:21.00
## Median :2.000 Median :3.000 Median :24.00
## Mean :1.648 Mean :3.217 Mean :24.05
## 3rd Qu.:2.000 3rd Qu.:4.000 3rd Qu.:27.00
## Max. :2.000 Max. :5.000 Max. :30.00
## NA's :8
# Print the mean and median of age and education for the 20-30 subsample.
cat("The bfi.sample mean age is", mean(bfi.sample$age.bfi), "\n")
## The bfi.sample mean age is 24.05083
# This line reports the median, so its label says "median" (it previously
# repeated "mean age").
cat("The bfi.sample median age is", median(bfi.sample$age.bfi), "\n")
## The bfi.sample median age is 24
# Education has missing values in the subsample, so they are removed first.
cat("Omitting missing values, the bfi.sample mean education level is",
    mean(bfi.sample$education.bfi, na.rm = TRUE), "\n")
## Omitting missing values, the bfi.sample mean education level is 3.217426
cat("Omitting missing values, the bfi.sample median education level is",
    median(bfi.sample$education.bfi, na.rm = TRUE), "\n")
## Omitting missing values, the bfi.sample median education level is 3
# Build a 2 x 4 comparison table: one row per data set, one column per
# statistic (mean/median of age, then mean/median of education).
full_stats <- c(
  mean(bfi$age),
  median(bfi$age),
  mean(bfi$education, na.rm = TRUE),
  median(bfi$education, na.rm = TRUE)
)
sample_stats <- c(
  mean(bfi.sample$age.bfi),
  median(bfi.sample$age.bfi),
  mean(bfi.sample$education.bfi, na.rm = TRUE),
  median(bfi.sample$education.bfi, na.rm = TRUE)
)
# The argument names passed to rbind() become the row names.
tbl <- rbind(bfi = full_stats, bfi.sample = sample_stats)
colnames(tbl) <- c("MeanAge", "MedianAge", "MeanEduc", "MedianEduc")
tbl <- as.table(tbl)
tbl
## MeanAge MedianAge MeanEduc MedianEduc
## bfi 28.782143 26.000000 3.189515 3.000000
## bfi.sample 24.050834 24.000000 3.217426 3.000000
# Plot the statistics side by side, one bar per data set, with a legend.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
barplot(tbl, legend = TRUE, beside = TRUE, main = 'Comparison of Means')
# Replace the numeric education codes with the value labels from the codebook
# (code 1 is the first label, code 5 the last). The labels are looked up from
# the still-numeric codes in a single step, instead of writing a string into
# the numeric column and relying on R's silent coercion to character for the
# later comparisons. Codes outside 1-5 would become NA; none occur in the
# summary of this data.
education_labels <- c(
  "HS", "finished HS", "some college", "college graduate", "graduate degree"
)
education_codes <- bfi.sample$education.bfi
# Only recode while the column still holds numeric codes, so running this
# section a second time leaves the labels untouched.
if (is.numeric(education_codes)) {
  education_text <- as.character(
    factor(education_codes,
           levels = seq_along(education_labels),
           labels = education_labels)
  )
  # Missing codes get an explicit placeholder rather than staying NA.
  education_text[is.na(education_codes)] <- "data unavailable"
  bfi.sample$education.bfi <- education_text
}
head(bfi.sample, n = 20)
## gender.bfi education.bfi age.bfi
## 6 2 some college 21
## 11 1 HS 21
## 24 2 finished HS 27
## 26 2 some college 20
## 33 1 graduate degree 23
## 35 1 some college 20
## 36 1 some college 23
## 37 1 some college 20
## 38 1 some college 21
## 39 1 data unavailable 30
## 42 2 college graduate 27
## 44 1 college graduate 20
## 45 2 graduate degree 24
## 46 1 some college 25
## 47 1 finished HS 22
## 50 1 some college 20
## 51 2 some college 24
## 52 2 college graduate 26
## 53 1 college graduate 26
## 54 2 some college 25