library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
data<-read.table('/Users/MacBook/Desktop/pop_dataset_0002.txt',sep = ',', header = TRUE)

#1.1 Print out the dimensions of the data frame.

print(dim(data))
## [1] 56000     4

#1.2 Print out the names and type of each of the data frame’s columns.

print(sapply(data,class))
##      region         age      gender  population 
## "character"   "integer" "character"   "integer"

#1.3 Print out the number of unique regions in the dataset.

print(count(distinct(data, region)))
##     n
## 1 500

#1.4 What is the minimum age bin?

print(summarise(data , min = min(age)))
##   min
## 1   0

#1.5 What is the maximum age bin?

print(summarise(data , max= max(age)))
##   max
## 1  55

#1.6 What is the bin size for the age field?

print(count(distinct(data,age)))
##    n
## 1 56

Q 2)

#2.1 Use the expected value for the age to find the mean age for the whole data sample.

Age <- rep(data$age , data$population)
print(mean(Age))
## [1] 27.80027

#2.2 Provide the standard deviation for the whole data sample.

print(sd(Age))
## [1] 15.77805

Q 3 )

data_for_summary_statistics<-data.frame('Region'=rep((data$region), data$population),
                                           'Age'=rep((data$age), data$population))
#data frame for means for each region 
region_age_mean<-data_for_summary_statistics %>%
  group_by(Region) %>%
  dplyr::summarize(Mean = mean(Age))
## `summarise()` ungrouping output (override with `.groups` argument)
region_age_mean <-as.data.frame(region_age_mean[c(1,2)])

#3.1 mean age

print(summarise(region_age_mean, mean = mean(Mean)))
##       mean
## 1 30.60846

#3.2 standard deviation

print(summarise(region_age_mean, sd = sd(Mean)))
##         sd
## 1 7.996179

#3.3 minimum

print(summarise(region_age_mean, Min = min(Mean)))
##   Min
## 1   2

#3.4 first quartile

print(summarise(region_age_mean, Q1 = quantile(Mean,0.25)))
##         Q1
## 1 27.42578

#3.5 median

print(summarise(region_age_mean, Q2 = quantile(Mean,0.5)))
##         Q2
## 1 29.23158

#3.6 third quartile

print(summarise(region_age_mean, Q3 = quantile(Mean,0.75)))
##         Q3
## 1 33.35013

#3.7 maximum

print(summarise(region_age_mean, Max = max(Mean)))
##   Max
## 1  55

#3.8 interquartile range

print(summarise(region_age_mean, Max = max(Mean)))
##   Max
## 1  55

#3.9 histogram plot of the distribution of means from each region

qplot(region_age_mean$Mean,
      geom="histogram",
      binwidth = 5,  
      main = "Histogram for Mean age", 
      xlab = "Age",  
      fill=I("white"), 
      col=I("red"), 
      alpha=I(.5))

Q 4 )

#4.1 Show which region has the least people and how many it has. #data frame for means for each region

total_population_by_region<-data %>%
  group_by(region) %>%
  dplyr::summarize(sum = sum(population))
## `summarise()` ungrouping output (override with `.groups` argument)
total_population_by_region <-as.data.frame(total_population_by_region[c(1,2)])

Findng region with minimum population

total_population_by_region %>%
  filter(sum == min(sum))
##      region sum
## 1  SSC20099   3
## 2  SSC20127   3
## 3  SSC20151   3
## 4  SSC20346   3
## 5  SSC20383   3
## 6  SSC20398   3
## 7  SSC20422   3
## 8  SSC20502   3
## 9  SSC20503   3
## 10 SSC20516   3
## 11 SSC21012   3
## 12 SSC21026   3
## 13 SSC21037   3
## 14 SSC21283   3
## 15 SSC22012   3
## 16 SSC22021   3
## 17 SSC22028   3
## 18 SSC22070   3
## 19 SSC22084   3
## 20 SSC22157   3
## 21 SSC22193   3
## 22 SSC22237   3
## 23 SSC22281   3
## 24 SSC22367   3
## 25 SSC22597   3
## 26 SSC22605   3
## 27 SSC22606   3
## 28 SSC22616   3
## 29 SSC22698   3
## 30 SSC22702   3
## 31 SSC22719   3
## 32 SSC22772   3
## 33 SSC22809   3
## 34 SSC22840   3
## 35 SSC22873   3

#4.2 Plot the distribution of ages for the region with the least people.

# Get cities wit minimum population 
least_population_cities<-total_population_by_region %>%
  filter(sum == 3)

# Joined with corresponsing ages
joined_table <- left_join(least_population_cities, data, 
                           by = c("region" = "region"))
# Dataset with ages 
age_dataset<- data.frame('Age'=rep((joined_table$age), joined_table$population))

# Histogram of the ages
qplot(age_dataset$Age,
      geom="histogram",
      binwidth = 5,  
      main = "Histogram for age of regions with the least people", 
      xlab = "Age",  
      fill=I("blue"), 
      col=I("black"), 
      alpha=I(.5))

#Q.5

# Findng region with maximum population 
highest_population_city<-total_population_by_region %>%
  filter(sum == max(sum))

# Joined with corresponsing ages
joined_table_2 <- left_join(highest_population_city, data, 
                           by = c("region" = "region"))

#5.1 Plot cumulative distribution for the regions with the most people.

ggplot(joined_table_2, aes(population)) + stat_ecdf(geom = "step")

#5.2 Plot the cumulative distribution for males and females on the same plot.

ggplot(joined_table_2,aes(x = population, colour= gender)) +
  stat_ecdf( geom = "step")

#Q.6

#Young People
Young_people<-data_for_summary_statistics %>%
  filter(Age < 40)
#Old people
older_people<-data_for_summary_statistics %>%
  filter(Age >= 40)
#get count of young and old people
Young_people_by_region<-Young_people %>%group_by(Region)%>%count(Region)
older_people_by_region<-older_people %>%group_by(Region)%>%count(Region)

# merge young and old people
young_older_people_count <-left_join(Young_people_by_region, older_people_by_region, 
                                     by = c("Region" = "Region"))

#Remove missing values
young_older_people_count<- na.omit(young_older_people_count)

#Ratio calculation
young_older_people_count$ratio<-young_older_people_count$n.y/young_older_people_count$n.x

# Merging population with raio
age_ratio <-left_join(total_population_by_region, young_older_people_count, 
                                     by = c("region" = "Region"))

#Remove missing values
age_ratio<- na.omit(age_ratio)

#Scatterplot
ggplot(age_ratio, aes(x = sum, y = ratio)) +
  geom_point() +labs(
    title = "Plot of old to young people ratio against population",
    x = "population",
    y = "Old to young people ratio")

#Q.7

#Data for gender calculation
data_for_gender_calculation<-data.frame('Region'=rep((data$region), data$population),
                                        'Gender'=rep((data$gender), data$population))

#Males
males<-data_for_gender_calculation %>%
  filter(Gender=='M')
#Females
females<-data_for_gender_calculation %>%
  filter(Gender=='F')

#get count of males and females
males_by_region<-males %>%group_by(Region)%>%count(Region)
females_by_region<-females %>%group_by(Region)%>%count(Region)

# merge young and old people
gender_count <-left_join(males_by_region, females_by_region, 
                                     by = c("Region" = "Region"))
#Remove missing values
gender_count<- na.omit(gender_count)

#Gender Ratio calculation
gender_count$ratio<-gender_count$n.x/gender_count$n.y

# Merging population with raio
gender_ratio <-left_join(total_population_by_region, gender_count, 
                      by = c("region" = "Region"))

#Remove missing values
gender_ratio<- na.omit(gender_ratio)

#Scatterplot
ggplot(gender_ratio, aes(x = sum, y = ratio)) +
  geom_point() +labs(
    title = "Plot of gender ratio (male to female) against population",
    x = "population",
    y = "male to female ratio")

#Q.8

# 8.1
#Calculation total population by gender
total_population_by_gender<-data %>%
  group_by(gender) %>%
  dplyr::summarize(sum = sum(population))
## `summarise()` ungrouping output (override with `.groups` argument)
#Finding which gender groups have higher population 
print(total_population_by_gender %>% arrange(desc(sum)))
## # A tibble: 2 x 2
##   gender    sum
##   <chr>   <int>
## 1 F      400505
## 2 M      395510
#Calculation total population by age
total_population_by_age<-data %>%
  group_by(age) %>%
  dplyr::summarize(sum = sum(population))
## `summarise()` ungrouping output (override with `.groups` argument)
#Finding which gender groups have higher population 
print(head(total_population_by_age %>% arrange(desc(sum))))
## # A tibble: 6 x 2
##     age   sum
##   <int> <int>
## 1    30 16823
## 2    31 16673
## 3    28 16655
## 4    33 16510
## 5    29 16462
## 6    26 16449
# 8.2

# Region with higher females
print(head(females_by_region %>% arrange(desc(n))))
## # A tibble: 6 x 2
## # Groups:   Region [6]
##   Region       n
##   <chr>    <int>
## 1 SSC22015 19303
## 2 SSC21671 11715
## 3 SSC21125 10220
## 4 SSC21143  9684
## 5 SSC20911  9654
## 6 SSC20773  9491
# Region with higher population with age group 28<= age <= 31
age_filtered_data<-data %>%
  filter(28<= age & age <= 31 )

# Total population 
total_population_by_region_age<-age_filtered_data %>%
  group_by(region, age) %>%
  dplyr::summarize(sum = sum(population))
## `summarise()` regrouping output by 'region' (override with `.groups` argument)
total_population_by_region_age <-as.data.frame(total_population_by_region_age[c(1,2,3)])

print(head(total_population_by_region_age%>% arrange(desc(sum))))
##     region age sum
## 1 SSC22015  30 937
## 2 SSC22015  31 906
## 3 SSC22015  28 803
## 4 SSC22015  29 791
## 5 SSC22569  31 731
## 6 SSC21143  29 667

#Q.9

#Selecting the region with higher population data
print(head(total_population_by_region%>% arrange(desc(sum))))
##     region   sum
## 1 SSC22015 37948
## 2 SSC21671 22979
## 3 SSC21125 20939
## 4 SSC20911 19340
## 5 SSC22569 19274
## 6 SSC21143 19180
age<- data %>%
  filter(region=='SSC22015')

data_for_age<-data.frame('Region'=rep((age$region), age$population),
                                        'Age'=rep((age$age), age$population))
# 9.1
# When N=1
n<-1
xbar<-rep(NA,n)
for (i in 1: n){
  
  mysample<-sample(data_for_age$Age, size = 100)
  xbar[i]<-mean(mysample)
}

mean(xbar)
## [1] 27.74
sd(xbar)
## [1] NA
# 9.2
# When N=2
n<-2
xbar<-rep(NA,n)
for (i in 1: n){
  
  mysample<-sample(data_for_age$Age, size = 100)
  xbar[i]<-mean(mysample)
}

mean(xbar)
## [1] 24.985
sd(xbar)
## [1] 1.407142
hist(xbar)
lines(density(xbar), col="blue", lwd=2)

# 9.3
# When N=10
n<-10
xbar<-rep(NA,n)
for (i in 1: n){
  
  mysample<-sample(data_for_age$Age, size = 100)
  xbar[i]<-mean(mysample)
}

mean(xbar)
## [1] 25.524
sd(xbar)
## [1] 1.677102
hist(xbar)
lines(density(xbar), col="blue", lwd=2)

# When N=100
n<-100
xbar<-rep(NA,n)
for (i in 1: n){
  
  mysample<-sample(data_for_age$Age, size = 100)
  xbar[i]<-mean(mysample)
}

mean(xbar)
## [1] 25.5372
sd(xbar)
## [1] 1.383116
hist(xbar)
lines(density(xbar), col="blue", lwd=2)

# When N=1000
n<-1000
xbar<-rep(NA,n)
for (i in 1: n){
  
  mysample<-sample(data_for_age$Age, size = 100)
  xbar[i]<-mean(mysample)
}

mean(xbar)
## [1] 25.49208
sd(xbar)
## [1] 1.606438
hist(xbar)
lines(density(xbar), col="blue", lwd=2)