Week 3 Workbook

Data Analysis

1) Exercises 6.6.1

Problem A

# Problem A: per-state summary statistics of poptotal in the midwest dataset.
midwest %>%
  group_by(state) %>% # one group per state
  summarise(
    poptotalmean = mean(poptotal), # mean of poptotal
    poptotalmed = median(poptotal), # median of poptotal
    popmax = max(poptotal), # largest poptotal
    popmin = min(poptotal), # smallest poptotal
    popdistinct = n_distinct(poptotal), # number of distinct poptotal values
    popfirst = first(poptotal), # poptotal in the group's first row
    popany = any(poptotal < 5000), # TRUE if any row is below 5,000
    popany2 = any(poptotal > 2000000) # TRUE if any row is above 2,000,000
  ) %>%
  ungroup() # make sure the result carries no grouping

Problem B

# Problem B: per-state counts of rows with small and very large poptotal.
# sum() over a logical vector counts its TRUE values.
midwest %>%
  group_by(state) %>% # one group per state
  summarise(
    num5k = sum(poptotal < 5000), # rows with poptotal below 5,000
    num2mil = sum(poptotal > 2000000), # rows with poptotal above 2,000,000
    numrows = n() # total rows in the group
  ) %>%
  ungroup() # make sure the result carries no grouping

Problem C - Part 1

# Problem C - Part 1: for each county name, count the distinct states it
# appears in, listing the largest counts first.
midwest %>%
  group_by(county) %>% # one group per county name
  summarise(x = n_distinct(state)) %>% # x = number of distinct states
  arrange(desc(x)) %>% # sort by x, descending
  ungroup() # make sure the result carries no grouping

Problem C - Part 2

# Problem C - Part 2: count the rows recorded under each county name.
# Each pipeline step sits on its own line so that a trailing comment cannot
# swallow the next step.
midwest %>%
  group_by(county) %>% # one group per county name
  summarize(x = n()) %>% # x = number of rows in the group
  ungroup() # make sure the result carries no grouping

Problem C - Part 3

# Problem C - Part 3: count distinct county names within each county group.
# Because the data is grouped by county, every group holds exactly one
# distinct county value, so x is 1 on every row.
# Each pipeline step sits on its own line so that a trailing comment cannot
# swallow the next step.
midwest %>%
  group_by(county) %>% # one group per county name
  summarize(x = n_distinct(county)) %>% # x = distinct county names in group
  ungroup() # make sure the result carries no grouping

Problem D

# Problem D: per-clarity counts of distinct colours, distinct prices and rows.
# The pipe operator must end the line it continues from; a line that starts
# with %>% after a complete expression is a parse error.
diamonds %>%
  group_by(clarity) %>% # one group per clarity level
  summarize(
    a = n_distinct(color), # number of distinct colours
    b = n_distinct(price), # number of distinct prices
    c = n() # number of rows in the group
  ) %>%
  ungroup() # make sure the result carries no grouping

Problem E - Part 1

# Problem E - Part 1: mean and standard deviation of price for every
# colour/cut combination, grouping by colour first.
diamonds %>%
  group_by(color, cut) %>% # group by color, then cut
  summarise(
    m = mean(price), # mean price
    s = sd(price) # standard deviation of price
  ) %>%
  ungroup() # remove the grouping left after summarising

Problem E - Part 2

# Problem E - Part 2: mean and standard deviation of price for every
# cut/colour combination, grouping by cut first (the reverse of Part 1).
# Each pipeline step sits on its own line so that a trailing comment cannot
# swallow the next step.
diamonds %>%
  group_by(cut, color) %>% # group by cut, then color
  summarize(
    m = mean(price), # mean price
    s = sd(price) # standard deviation of price
  ) %>%
  ungroup() # remove the grouping left after summarising

Problem E - Part 3

# Problem E - Part 3: price summary for every cut/colour/clarity combination,
# plus a sale price derived from the mean.
diamonds %>%
  group_by(cut, color, clarity) %>% # group by cut, then color, then clarity
  summarise(
    m = mean(price), # mean price
    s = sd(price), # standard deviation of price
    msale = m * 0.8 # 80% of the mean price computed just above
  ) %>%
  ungroup() # remove the grouping left after summarising

Problem F

# Problem F: per-cut summary in which later columns are built from earlier
# ones within the same summarise() call.
diamonds %>%
  group_by(cut) %>% # one group per cut
  summarise(
    potato = mean(depth), # mean depth
    pizza = mean(price), # mean price
    popcorn = median(y), # median of column y
    pineapple = potato - pizza, # mean depth minus mean price
    papaya = pineapple^2, # square of that difference
    peach = n() # number of rows in the group
  ) %>%
  ungroup() # make sure the result carries no grouping

Problem G - Part 1

# Problem G - Part 1: mean price per colour, then two extra columns added
# with mutate() before ungroup() is called.
# String literals need straight quotes; typographic quotes do not parse in R.
diamonds %>%
  group_by(color) %>% # one group per color
  summarize(m = mean(price)) %>% # m = mean price
  mutate(
    x1 = str_c("Diamond color", color), # fixed text joined directly to color
    x2 = 5 # constant column
  ) %>%
  ungroup() # make sure the result carries no grouping

Problem G - Part 2

# Problem G - Part 2: same as Part 1, but ungroup() is called before mutate()
# adds the two extra columns.
# String literals need straight quotes; typographic quotes do not parse in R.
diamonds %>%
  group_by(color) %>% # one group per color
  summarize(m = mean(price)) %>% # m = mean price
  ungroup() %>% # make sure no grouping reaches mutate()
  mutate(
    x1 = str_c("Diamond color", color), # fixed text joined directly to color
    x2 = 5 # constant column
  )

Problem H - Part 1

# Problem H - Part 1: halve every price, then take the mean of the halved
# price within each colour (one output row per colour).
# Each pipeline step sits on its own line so that a trailing comment cannot
# swallow the next step.
diamonds %>%
  group_by(color) %>% # one group per color
  mutate(x1 = price * 0.5) %>% # x1 = 50% of the price
  summarize(m = mean(x1)) %>% # m = mean of x1 within each color
  ungroup() # make sure the result carries no grouping

Problem H - Part 2

# Problem H - Part 2: halve every price, drop the grouping, then take a
# single overall mean of the halved price.
diamonds %>%
  group_by(color) %>% # one group per color
  mutate(x1 = price * 0.5) %>% # x1 = 50% of the price
  ungroup() %>% # remove the color grouping before summarising
  # With the grouping gone, this yields one overall mean across all colours
  # rather than one mean per colour.
  summarise(m = mean(x1))

Research Methods

Exercise 1

A good question about the diamonds dataset

Are diamonds with a greater depth more expensive?

A bad question about the diamonds dataset

How do features of a diamond affect price?