Week 3 Workbook
Data Analysis
1) Exercises 6.6.1
Problem A
# Problem A: per-state summary statistics of the total population column.
midwest %>%
  group_by(state) %>% # one group per value of state
  summarise(
    poptotalmean = mean(poptotal),      # mean of poptotal
    poptotalmed = median(poptotal),     # median of poptotal
    popmax = max(poptotal),             # largest poptotal
    popmin = min(poptotal),             # smallest poptotal
    popdistinct = n_distinct(poptotal), # number of distinct poptotal values
    popfirst = first(poptotal),         # poptotal in the group's first row
    popany = any(poptotal < 5000),      # TRUE if any poptotal is below 5000
    popany2 = any(poptotal > 2000000)   # TRUE if any poptotal is above 2000000
  ) %>%
  ungroup() # make sure no grouping is carried forward
Problem B
# Problem B: per state, count the rows whose total population is below 5000,
# the rows whose total population is above 2000000, and all rows.
# Summing a logical vector counts its TRUE values.
midwest %>% # start from the midwest dataset
  group_by(state) %>% # one group per value of state
  summarise(
    # "less than 5000", matching the exercise description and Problem A;
    # the earlier `>` counted the opposite set of rows.
    num5k = sum(poptotal < 5000),
    num2mil = sum(poptotal > 2000000), # rows with poptotal above 2000000
    numrows = n()                      # total rows in the group
  ) %>%
  ungroup() # make sure no grouping is carried forward
Problem C - Part 1
# Problem C, Part 1: for each county name, count how many distinct states it
# appears in, and list the names that occur in the most states first.
midwest %>%
  group_by(county) %>% # one group per value of the county variable
  summarise(x = n_distinct(state)) %>% # x = number of distinct states
  arrange(desc(x)) %>% # sort by x, largest first
  ungroup() # make sure no grouping is carried forward
Problem C - Part 2
# Problem C, Part 2: count the rows for each county name.
# Each pipeline step sits on its own line; when the steps shared a line with a
# trailing comment they were commented out and never ran.
midwest %>% # start from the midwest dataset
  group_by(county) %>% # one group per value of the county variable
  summarise(x = n()) %>% # x = number of rows in the group
  ungroup() # make sure no grouping is carried forward
Problem C - Part 3
# Problem C, Part 3: count the distinct county values inside each county group.
# Every row in a group shares the same county value, so x is 1 for each group.
# Each pipeline step sits on its own line; when the steps shared a line with a
# trailing comment they were commented out and never ran.
midwest %>% # start from the midwest dataset
  group_by(county) %>% # one group per value of the county variable
  summarise(x = n_distinct(county)) %>% # x = distinct county values in group
  ungroup() # make sure no grouping is carried forward
Problem D
# Problem D: for each clarity level, count distinct colours, distinct prices
# and the number of observations.
# The pipe must end the line it continues from: a line that starts with %>%
# is a parse error because the previous expression is already complete.
diamonds %>% # start from the diamonds dataset
  group_by(clarity) %>% # one group per value of clarity
  summarise(
    a = n_distinct(color), # number of distinct colours
    b = n_distinct(price), # number of distinct prices
    c = n()                # number of rows in the group
  ) %>%
  ungroup() # make sure no grouping is carried forward
##Problem E - Part 1
# Problem E, Part 1: mean and standard deviation of price for every
# combination of color and cut (grouping by color first, then cut).
diamonds %>%
  group_by(color, cut) %>%
  summarise(
    m = mean(price), # mean price in the group
    s = sd(price)    # standard deviation of price in the group
  ) %>%
  ungroup() # drop the grouping that remains after summarising
##Problem E - Part 2
# Problem E, Part 2: mean and standard deviation of price for every
# combination of cut and color (grouping by cut first, then color).
# Each pipeline step sits on its own line; when the steps shared a line with a
# trailing comment they were commented out and never ran.
diamonds %>% # start from the diamonds dataset
  group_by(cut, color) %>% # group by cut, then by color
  summarise(
    m = mean(price), # mean price in the group
    s = sd(price)    # standard deviation of price in the group
  ) %>%
  ungroup() # drop the grouping that remains after summarising
Problem E - Part 3
# Problem E, Part 3: price statistics for every cut / color / clarity
# combination, plus a price at 80% of the group mean.
diamonds %>%
  group_by(cut, color, clarity) %>%
  summarise(
    m = mean(price),  # mean price in the group
    s = sd(price),    # standard deviation of price in the group
    msale = m * 0.80  # 80% of the mean computed just above
  ) %>%
  ungroup() # drop the grouping that remains after summarising
Problem F
# Problem F: per-cut summaries, where later columns are built from earlier
# ones inside the same summarise() call.
diamonds %>%
  group_by(cut) %>% # one group per value of cut
  summarise(
    potato = mean(depth),        # mean depth
    pizza = mean(price),         # mean price
    popcorn = median(y),         # median of column y
    pineapple = potato - pizza,  # mean depth minus mean price
    papaya = pineapple^2,        # square of that difference
    peach = n()                  # number of rows in the group
  ) %>%
  ungroup() # make sure no grouping is carried forward
Problem G - Part 1
# Problem G, Part 1: mean price per color, then add a label column and a
# constant column before ungrouping.
# The string uses plain double quotes: typographic (curly) quotes are not
# string delimiters in R and make the line fail to parse.
diamonds %>% # start from the diamonds dataset
  group_by(color) %>% # one group per value of color
  summarise(m = mean(price)) %>% # m = mean price in the group
  mutate(
    x1 = str_c("Diamond color", color), # fixed text joined to the color value
    x2 = 5                              # constant column
  ) %>%
  ungroup() # make sure no grouping is carried forward
Problem G - Part 2
# Problem G, Part 2: mean price per color, ungroup, then add a label column
# and a constant column.
# The string uses plain double quotes: typographic (curly) quotes are not
# string delimiters in R and make the line fail to parse.
diamonds %>% # start from the diamonds dataset
  group_by(color) %>% # one group per value of color
  summarise(m = mean(price)) %>% # m = mean price in the group
  ungroup() %>% # drop any grouping before adding columns
  mutate(
    x1 = str_c("Diamond color", color), # fixed text joined to the color value
    x2 = 5                              # constant column
  )
Problem H - Part 1
# Problem H, Part 1: halve every price, then take the mean of the halved
# price within each color.
# group_by() sits on its own line; when it shared a line with a trailing
# comment it was commented out, so the mean was taken over the whole dataset.
diamonds %>% # start from the diamonds dataset
  group_by(color) %>% # one group per value of color
  mutate(x1 = price * 0.5) %>% # x1 = 50% of price
  summarise(m = mean(x1)) %>% # m = mean of x1 within each color
  ungroup() # make sure no grouping is carried forward
Problem H - Part 2
# Problem H, Part 2: halve every price, remove the grouping, then take a
# single overall mean of the halved price.
diamonds %>%
  group_by(color) %>% # one group per value of color
  mutate(x1 = price * 0.5) %>% # x1 = 50% of price
  ungroup() %>% # grouping is removed before summarising
  # With the color grouping already removed, this yields one overall mean
  # rather than one mean per color.
  summarise(m = mean(x1))
Research Methods
Exercise 1
A good question about the diamonds dataset
Are diamonds with a greater depth more expensive?
A bad question about the diamonds dataset
How do features of a diamond affect price?