library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
data<-read.table('/Users/MacBook/Desktop/pop_dataset_0002.txt',sep = ',', header = TRUE)
#1.1 Print out the dimensions of the data frame.
print(dim(data))
## [1] 56000 4
#1.2 Print out the names and type of each of the data frame’s columns.
print(sapply(data,class))
## region age gender population
## "character" "integer" "character" "integer"
#1.3 Print out the number of unique regions in the dataset.
print(count(distinct(data, region)))
## n
## 1 500
#1.4 What is the minimum age bin?
print(summarise(data , min = min(age)))
## min
## 1 0
#1.5 What is the maximum age bin?
print(summarise(data , max= max(age)))
## max
## 1 55
#1.6 What is the bin size for the age field?
print(count(distinct(data,age)))
## n
## 1 56
Q 2)
#2.1 Use the expected value for the age to find the mean age for the whole data sample.
Age <- rep(data$age , data$population)
print(mean(Age))
## [1] 27.80027
#2.2 Provide the standard deviation for the whole data sample.
print(sd(Age))
## [1] 15.77805
Q 3 )
data_for_summary_statistics<-data.frame('Region'=rep((data$region), data$population),
'Age'=rep((data$age), data$population))
#data frame for means for each region
region_age_mean<-data_for_summary_statistics %>%
group_by(Region) %>%
dplyr::summarize(Mean = mean(Age))
## `summarise()` ungrouping output (override with `.groups` argument)
region_age_mean <-as.data.frame(region_age_mean[c(1,2)])
#3.1 mean age
print(summarise(region_age_mean, mean = mean(Mean)))
## mean
## 1 30.60846
#3.2 standard deviation
print(summarise(region_age_mean, sd = sd(Mean)))
## sd
## 1 7.996179
#3.3 minimum
print(summarise(region_age_mean, Min = min(Mean)))
## Min
## 1 2
#3.4 first quartile
print(summarise(region_age_mean, Q1 = quantile(Mean,0.25)))
## Q1
## 1 27.42578
#3.5 median
print(summarise(region_age_mean, Q2 = quantile(Mean,0.5)))
## Q2
## 1 29.23158
#3.6 third quartile
print(summarise(region_age_mean, Q3 = quantile(Mean,0.75)))
## Q3
## 1 33.35013
#3.7 maximum
print(summarise(region_age_mean, Max = max(Mean)))
## Max
## 1 55
#3.8 interquartile range
print(summarise(region_age_mean, Max = max(Mean)))
## Max
## 1 55
#3.9 histogram plot of the distribution of means from each region
qplot(region_age_mean$Mean,
geom="histogram",
binwidth = 5,
main = "Histogram for Mean age",
xlab = "Age",
fill=I("white"),
col=I("red"),
alpha=I(.5))
Q 4 )
#4.1 Show which region has the least people and how many it has. #data frame for means for each region
total_population_by_region<-data %>%
group_by(region) %>%
dplyr::summarize(sum = sum(population))
## `summarise()` ungrouping output (override with `.groups` argument)
total_population_by_region <-as.data.frame(total_population_by_region[c(1,2)])
total_population_by_region %>%
filter(sum == min(sum))
## region sum
## 1 SSC20099 3
## 2 SSC20127 3
## 3 SSC20151 3
## 4 SSC20346 3
## 5 SSC20383 3
## 6 SSC20398 3
## 7 SSC20422 3
## 8 SSC20502 3
## 9 SSC20503 3
## 10 SSC20516 3
## 11 SSC21012 3
## 12 SSC21026 3
## 13 SSC21037 3
## 14 SSC21283 3
## 15 SSC22012 3
## 16 SSC22021 3
## 17 SSC22028 3
## 18 SSC22070 3
## 19 SSC22084 3
## 20 SSC22157 3
## 21 SSC22193 3
## 22 SSC22237 3
## 23 SSC22281 3
## 24 SSC22367 3
## 25 SSC22597 3
## 26 SSC22605 3
## 27 SSC22606 3
## 28 SSC22616 3
## 29 SSC22698 3
## 30 SSC22702 3
## 31 SSC22719 3
## 32 SSC22772 3
## 33 SSC22809 3
## 34 SSC22840 3
## 35 SSC22873 3
#4.2 Plot the distribution of ages for the region with the least people.
# Get cities wit minimum population
least_population_cities<-total_population_by_region %>%
filter(sum == 3)
# Joined with corresponsing ages
joined_table <- left_join(least_population_cities, data,
by = c("region" = "region"))
# Dataset with ages
age_dataset<- data.frame('Age'=rep((joined_table$age), joined_table$population))
# Histogram of the ages
qplot(age_dataset$Age,
geom="histogram",
binwidth = 5,
main = "Histogram for age of regions with the least people",
xlab = "Age",
fill=I("blue"),
col=I("black"),
alpha=I(.5))
#Q.5
# Findng region with maximum population
highest_population_city<-total_population_by_region %>%
filter(sum == max(sum))
# Joined with corresponsing ages
joined_table_2 <- left_join(highest_population_city, data,
by = c("region" = "region"))
#5.1 Plot cumulative distribution for the regions with the most people.
ggplot(joined_table_2, aes(population)) + stat_ecdf(geom = "step")
#5.2 Plot the cumulative distribution for males and females on the same plot.
ggplot(joined_table_2,aes(x = population, colour= gender)) +
stat_ecdf( geom = "step")
#Q.6
#Young People
Young_people<-data_for_summary_statistics %>%
filter(Age < 40)
#Old people
older_people<-data_for_summary_statistics %>%
filter(Age >= 40)
#get count of young and old people
Young_people_by_region<-Young_people %>%group_by(Region)%>%count(Region)
older_people_by_region<-older_people %>%group_by(Region)%>%count(Region)
# merge young and old people
young_older_people_count <-left_join(Young_people_by_region, older_people_by_region,
by = c("Region" = "Region"))
#Remove missing values
young_older_people_count<- na.omit(young_older_people_count)
#Ratio calculation
young_older_people_count$ratio<-young_older_people_count$n.y/young_older_people_count$n.x
# Merging population with raio
age_ratio <-left_join(total_population_by_region, young_older_people_count,
by = c("region" = "Region"))
#Remove missing values
age_ratio<- na.omit(age_ratio)
#Scatterplot
ggplot(age_ratio, aes(x = sum, y = ratio)) +
geom_point() +labs(
title = "Plot of old to young people ratio against population",
x = "population",
y = "Old to young people ratio")
#Q.7
#Data for gender calculation
data_for_gender_calculation<-data.frame('Region'=rep((data$region), data$population),
'Gender'=rep((data$gender), data$population))
#Males
males<-data_for_gender_calculation %>%
filter(Gender=='M')
#Females
females<-data_for_gender_calculation %>%
filter(Gender=='F')
#get count of males and females
males_by_region<-males %>%group_by(Region)%>%count(Region)
females_by_region<-females %>%group_by(Region)%>%count(Region)
# merge young and old people
gender_count <-left_join(males_by_region, females_by_region,
by = c("Region" = "Region"))
#Remove missing values
gender_count<- na.omit(gender_count)
#Gender Ratio calculation
gender_count$ratio<-gender_count$n.x/gender_count$n.y
# Merging population with raio
gender_ratio <-left_join(total_population_by_region, gender_count,
by = c("region" = "Region"))
#Remove missing values
gender_ratio<- na.omit(gender_ratio)
#Scatterplot
ggplot(gender_ratio, aes(x = sum, y = ratio)) +
geom_point() +labs(
title = "Plot of gender ratio (male to female) against population",
x = "population",
y = "male to female ratio")
#Q.8
# 8.1
#Calculation total population by gender
total_population_by_gender<-data %>%
group_by(gender) %>%
dplyr::summarize(sum = sum(population))
## `summarise()` ungrouping output (override with `.groups` argument)
#Finding which gender groups have higher population
print(total_population_by_gender %>% arrange(desc(sum)))
## # A tibble: 2 x 2
## gender sum
## <chr> <int>
## 1 F 400505
## 2 M 395510
#Calculation total population by age
total_population_by_age<-data %>%
group_by(age) %>%
dplyr::summarize(sum = sum(population))
## `summarise()` ungrouping output (override with `.groups` argument)
#Finding which gender groups have higher population
print(head(total_population_by_age %>% arrange(desc(sum))))
## # A tibble: 6 x 2
## age sum
## <int> <int>
## 1 30 16823
## 2 31 16673
## 3 28 16655
## 4 33 16510
## 5 29 16462
## 6 26 16449
# 8.2
# Region with higher females
print(head(females_by_region %>% arrange(desc(n))))
## # A tibble: 6 x 2
## # Groups: Region [6]
## Region n
## <chr> <int>
## 1 SSC22015 19303
## 2 SSC21671 11715
## 3 SSC21125 10220
## 4 SSC21143 9684
## 5 SSC20911 9654
## 6 SSC20773 9491
# Region with higher population with age group 28<= age <= 31
age_filtered_data<-data %>%
filter(28<= age & age <= 31 )
# Total population
total_population_by_region_age<-age_filtered_data %>%
group_by(region, age) %>%
dplyr::summarize(sum = sum(population))
## `summarise()` regrouping output by 'region' (override with `.groups` argument)
total_population_by_region_age <-as.data.frame(total_population_by_region_age[c(1,2,3)])
print(head(total_population_by_region_age%>% arrange(desc(sum))))
## region age sum
## 1 SSC22015 30 937
## 2 SSC22015 31 906
## 3 SSC22015 28 803
## 4 SSC22015 29 791
## 5 SSC22569 31 731
## 6 SSC21143 29 667
#Q.9
#Selecting the region with higher population data
print(head(total_population_by_region%>% arrange(desc(sum))))
## region sum
## 1 SSC22015 37948
## 2 SSC21671 22979
## 3 SSC21125 20939
## 4 SSC20911 19340
## 5 SSC22569 19274
## 6 SSC21143 19180
age<- data %>%
filter(region=='SSC22015')
data_for_age<-data.frame('Region'=rep((age$region), age$population),
'Age'=rep((age$age), age$population))
# 9.1
# When N=1
n<-1
xbar<-rep(NA,n)
for (i in 1: n){
mysample<-sample(data_for_age$Age, size = 100)
xbar[i]<-mean(mysample)
}
mean(xbar)
## [1] 27.74
sd(xbar)
## [1] NA
# 9.2
# When N=2
n<-2
xbar<-rep(NA,n)
for (i in 1: n){
mysample<-sample(data_for_age$Age, size = 100)
xbar[i]<-mean(mysample)
}
mean(xbar)
## [1] 24.985
sd(xbar)
## [1] 1.407142
hist(xbar)
lines(density(xbar), col="blue", lwd=2)
# 9.3
# When N=10
n<-10
xbar<-rep(NA,n)
for (i in 1: n){
mysample<-sample(data_for_age$Age, size = 100)
xbar[i]<-mean(mysample)
}
mean(xbar)
## [1] 25.524
sd(xbar)
## [1] 1.677102
hist(xbar)
lines(density(xbar), col="blue", lwd=2)
# When N=100
n<-100
xbar<-rep(NA,n)
for (i in 1: n){
mysample<-sample(data_for_age$Age, size = 100)
xbar[i]<-mean(mysample)
}
mean(xbar)
## [1] 25.5372
sd(xbar)
## [1] 1.383116
hist(xbar)
lines(density(xbar), col="blue", lwd=2)
# When N=1000
n<-1000
xbar<-rep(NA,n)
for (i in 1: n){
mysample<-sample(data_for_age$Age, size = 100)
xbar[i]<-mean(mysample)
}
mean(xbar)
## [1] 25.49208
sd(xbar)
## [1] 1.606438
hist(xbar)
lines(density(xbar), col="blue", lwd=2)