This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Dataset: incomedata.csv
Checking dataset
# Show the first six rows of the data
head(df, n = 6L)
# Compactly display the structure: dimensions, column names and types
str(object = df)
## 'data.frame': 700 obs. of 4 variables:
## $ agecat: chr "31-44" "Up to 30" "31-44" "31-44" ...
## $ educ : chr "Some college" "Did not complete high school" "Did not complete high school" "Did not complete high school" ...
## $ gender: chr "Female" "Male" "Male" "Male" ...
## $ income: int 176 31 55 120 28 25 67 38 19 25 ...
# Frequency of each distinct age category
table(df[["agecat"]])
##
## 31-44 Over 44 Up to 30
## 360 98 242
# Frequency of each distinct education level
table(df[["educ"]])
##
## College degree Did not complete high school
## 38 372
## High school degree Post-undergraduate degree
## 198 5
## Some college
## 87
# Frequency of each distinct gender value
table(df[["gender"]])
##
## Female Male
## 183 517
Select the variables age category (agecat) and gender for the subjects who did not complete high school
# Subjects who did not complete high school, keeping only the two requested
# columns (age category and gender). `educ` is needed for the filter but is
# not part of the requested output, so it is not selected.
df1 <- df %>%
  filter(educ == "Did not complete high school") %>%
  select(agecat, gender)
head(df1)
Select the variables age category and gender for the subjects who did not complete high school and have an income greater than 20
# Subjects who did not complete high school AND have an income above 20,
# reduced to the age category and gender columns.
df2 <- df %>%
  filter(educ == "Did not complete high school" & income > 20) %>%
  select(agecat, gender)
head(df2)
Select the male subjects with an income lower than 65 and compute their average income
# Mean income of male subjects earning less than 65.
# The outer parentheses assign the result and print it in one step.
(average_income <- df %>%
  filter(gender == "Male" & income < 65) %>%
  summarise(avginc = mean(income)))
Group the subjects by age category and compute the mean and the standard deviation of the income
# Per-age-category income summary: one row per category with the mean and
# the standard deviation of income. The outer parentheses print the result.
(df4 <- df %>%
  group_by(agecat) %>%
  summarise(
    avgincome = mean(income),
    stdincome = sd(income)
  ))
Sort the data frame by income, in ascending and descending order
# Lowest incomes first; show the first six rows
df %>%
  arrange(income) %>%
  head()
# Highest incomes first; show the first six rows
df %>%
  arrange(desc(income)) %>%
  head()
Count the number of subjects in each education category
# One row per education category with its number of subjects (column `n`)
df %>%
  count(educ)
Select 150 subjects at random, with and without replacement
# Draw 150 subjects at random, as the task requires (previously n = 100).
# With replacement: the same subject may be drawn more than once
slice_sample(df, n = 150, replace = TRUE)
# Without replacement: each subject appears at most once
slice_sample(df, n = 150, replace = FALSE)
Select 25% of the subjects at random, with and without replacement
# Draw 25% of the subjects at random, as the task requires (previously 0.5).
# With replacement: the same subject may be drawn more than once
slice_sample(df, prop = 0.25, replace = TRUE)
# Without replacement: each subject appears at most once
slice_sample(df, prop = 0.25, replace = FALSE)