This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# 1) Read the Affairs data set from a CSV hosted on GitHub and summarise it.
# NOTE(review): the URL path contains what looks like a commit hash, which
# should keep the downloaded file fixed over time -- confirm.
csv_file <- "https://raw.githubusercontent.com/ArcticNick/Rdataset/dd55db0eaf9da8cbe2c39bcd3b12e74885cb980d/Affairs.csv"
data <- read.csv(file = csv_file)
# Alternative source for the same file name (kept disabled):
#my_url <- "https://vincentarelbundock.github.io/Rdatasets/csv/AER/Affairs.csv"
#data <- read.csv(my_url)
# Per-column overview of the freshly loaded data frame.
summary(object = data)
## X affairs gender age
## Min. : 4 Min. : 0.000 Length:601 Min. :17.50
## 1st Qu.: 528 1st Qu.: 0.000 Class :character 1st Qu.:27.00
## Median :1009 Median : 0.000 Mode :character Median :32.00
## Mean :1060 Mean : 1.456 Mean :32.49
## 3rd Qu.:1453 3rd Qu.: 0.000 3rd Qu.:37.00
## Max. :9029 Max. :12.000 Max. :57.00
## yearsmarried children religiousness education
## Min. : 0.125 Length:601 Min. :1.000 Min. : 9.00
## 1st Qu.: 4.000 Class :character 1st Qu.:2.000 1st Qu.:14.00
## Median : 7.000 Mode :character Median :3.000 Median :16.00
## Mean : 8.178 Mean :3.116 Mean :16.17
## 3rd Qu.:15.000 3rd Qu.:4.000 3rd Qu.:18.00
## Max. :15.000 Max. :5.000 Max. :20.00
## occupation rating
## Min. :1.000 Min. :1.000
## 1st Qu.:3.000 1st Qu.:3.000
## Median :5.000 Median :4.000
## Mean :4.195 Mean :3.932
## 3rd Qu.:6.000 3rd Qu.:5.000
## Max. :7.000 Max. :5.000
# Mean and median of age and of years married, computed over the full data.
mean_age <- mean(data[["age"]])
median_age <- median(data[["age"]])
mean_yearsmarried <- mean(data[["yearsmarried"]])
median_yearsmarried <- median(data[["yearsmarried"]])
# Means are formatted to two decimals; medians are printed as computed.
print(paste(
  "Mean of age: ",
  sprintf("%.2f", mean_age)
))
## [1] "Mean of age: 32.49"
print(paste(
  "Median of age: ",
  median_age
))
## [1] "Median of age: 32"
print(paste(
  "Mean of Years Married: ",
  sprintf("%.2f", mean_yearsmarried)
))
## [1] "Mean of Years Married: 8.18"
print(paste(
  "Median of yearsmarried: ",
  median_yearsmarried
))
## [1] "Median of yearsmarried: 7"
# 2) and 3)
# Build the working subset in one pipeline:
#   - keep only the age, yearsmarried and gender columns,
#   - keep only the first 200 rows,
#   - add is_male: "yes" where gender is "male", "nope" otherwise.
subset_data <- data %>%
  select(age, yearsmarried, gender) %>%
  slice(1:200) %>%
  mutate(is_male = ifelse(gender == "male", "yes", "nope"))
# Show the first ten rows of the subset.
head(subset_data, n = 10)
## age yearsmarried gender is_male
## 1 37 10.00 male yes
## 2 27 4.00 female nope
## 3 32 15.00 female nope
## 4 57 15.00 male yes
## 5 22 0.75 male yes
## 6 32 1.50 female nope
## 7 22 0.75 female nope
## 8 57 15.00 male yes
## 9 32 15.00 female nope
## 10 22 1.50 male yes
# 4)
# Summarise the subset, then compute and print the mean and median of age and
# years married for it (same two columns as for the full data).
summary(object = subset_data)
## age yearsmarried gender is_male
## Min. :17.50 Min. : 0.125 Length:200 Length:200
## 1st Qu.:27.00 1st Qu.: 1.500 Class :character Class :character
## Median :32.00 Median : 7.000 Mode :character Mode :character
## Mean :32.45 Mean : 7.861
## 3rd Qu.:37.00 3rd Qu.:15.000
## Max. :57.00 Max. :15.000
mean_age2 <- mean(subset_data[["age"]])
median_age2 <- median(subset_data[["age"]])
mean_yearsmarried2 <- mean(subset_data[["yearsmarried"]])
median_yearsmarried2 <- median(subset_data[["yearsmarried"]])
# Heading first, then the two means (two decimals), then the two medians.
print("Means and Medians from the subset data")
## [1] "Means and Medians from the subset data"
print(paste(
  "Mean of subset age: ",
  sprintf("%.2f", mean_age2)
))
## [1] "Mean of subset age: 32.45"
print(paste(
  "Mean of subset Years Married: ",
  sprintf("%.2f", mean_yearsmarried2)
))
## [1] "Mean of subset Years Married: 7.86"
print(paste(
  "Median of age from subset data: ",
  median_age2
))
## [1] "Median of age from subset data: 32"
print(paste(
  "Median of yearsmarried from the subset data: ",
  median_yearsmarried2
))
## [1] "Median of yearsmarried from the subset data: 7"
# Compare the mean and the median of age between the full data and the subset.

# Build the sentence describing how a statistic of age in the full data relates
# to the same statistic in the subset.
#
# stat_name    - name of the statistic used in the sentence ("mean", "median").
# full_value   - the statistic computed on the full data (single number).
# subset_value - the statistic computed on the subset (single number).
#
# Returns a single character string. Equality is reported explicitly instead of
# being folded into the "less than" branch.
describe_age_comparison <- function(stat_name, full_value, subset_value) {
  relation <- if (full_value > subset_value) {
    "greater than"
  } else if (full_value < subset_value) {
    "less than"
  } else {
    "equal to"
  }
  paste0(
    "The ", stat_name, " of the age from data is ", relation,
    " the ", stat_name, " of the subset data."
  )
}

print(describe_age_comparison("mean", mean_age, mean_age2))
## [1] "The mean of the age from data is greater than the mean of the subset data."
# Both medians are 32, so this reports equality.
print(describe_age_comparison("median", median_age, median_age2))
## [1] "The median of the age from data is equal to the median of the subset data."
# 5) Changing 3 values in a column: changing the values 1.5, 4, and 15
# All three conditions are evaluated in one case_when() against the numeric
# column, so no comparison depends on a number being implicitly converted to
# text first. Rows that match none of the three keep their value, converted to
# character explicitly because the replacement labels make the column text.
subset_data <- mutate(
  subset_data,
  yearsmarried = case_when(
    near(yearsmarried, 15) ~ "number changed",
    near(yearsmarried, 1.5) ~ "one point five",
    near(yearsmarried, 4) ~ "four",
    TRUE ~ as.character(yearsmarried)
  )
)
# Show the first twenty rows so each replacement is visible.
head(subset_data, n = 20)
## age yearsmarried gender is_male
## 1 37 10 male yes
## 2 27 four female nope
## 3 32 number changed female nope
## 4 57 number changed male yes
## 5 22 0.75 male yes
## 6 32 one point five female nope
## 7 22 0.75 female nope
## 8 57 number changed male yes
## 9 32 number changed female nope
## 10 22 one point five male yes
## 11 37 number changed male yes
## 12 27 four male yes
## 13 47 number changed male yes
## 14 22 one point five female nope
## 15 27 four female nope
## 16 37 number changed female nope
## 17 37 number changed female nope
## 18 22 0.75 female nope
## 19 22 one point five female nope
## 20 27 10 female nope
# 6) Display enough rows to see examples of all steps 1-5 above.
# First fifteen rows of the full data.
head(data, n = 15)
## X affairs gender age yearsmarried children religiousness education
## 1 4 0 male 37 10.00 no 3 18
## 2 5 0 female 27 4.00 no 4 14
## 3 11 0 female 32 15.00 yes 1 12
## 4 16 0 male 57 15.00 yes 5 18
## 5 23 0 male 22 0.75 no 2 17
## 6 29 0 female 32 1.50 no 2 17
## 7 44 0 female 22 0.75 no 2 12
## 8 45 0 male 57 15.00 yes 2 14
## 9 47 0 female 32 15.00 yes 4 16
## 10 49 0 male 22 1.50 no 4 14
## 11 50 0 male 37 15.00 yes 2 20
## 12 55 0 male 27 4.00 yes 4 18
## 13 64 0 male 47 15.00 yes 5 17
## 14 80 0 female 22 1.50 no 2 17
## 15 86 0 female 27 4.00 no 4 14
## occupation rating
## 1 7 4
## 2 6 4
## 3 1 4
## 4 6 5
## 5 6 3
## 6 5 5
## 7 1 3
## 8 4 4
## 9 1 2
## 10 4 5
## 11 7 2
## 12 6 4
## 13 6 4
## 14 5 4
## 15 5 4
# First fifteen rows of the subset.
head(subset_data, n = 15)
## age yearsmarried gender is_male
## 1 37 10 male yes
## 2 27 four female nope
## 3 32 number changed female nope
## 4 57 number changed male yes
## 5 22 0.75 male yes
## 6 32 one point five female nope
## 7 22 0.75 female nope
## 8 57 number changed male yes
## 9 32 number changed female nope
## 10 22 one point five male yes
## 11 37 number changed male yes
## 12 27 four male yes
## 13 47 number changed male yes
## 14 22 one point five female nope
## 15 27 four female nope