This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
## Task A1: data analysis of the Births, Deaths, TFR, NOM and NIM CSV files
getwd()
## [1] "C:/Users/Dushyant/Downloads/FIT5145/assignment1/FIT5145 A1 Data"
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.3 v dplyr 1.0.2
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.0
## Warning: package 'tidyr' was built under R version 4.0.3
## Warning: package 'readr' was built under R version 4.0.3
## Warning: package 'purrr' was built under R version 4.0.3
## Warning: package 'dplyr' was built under R version 4.0.3
## Warning: package 'stringr' was built under R version 4.0.3
## Warning: package 'forcats' was built under R version 4.0.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Load the births table from the working directory.
birth <- read_csv(file = "./Births.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## Year = col_double(),
## NSW = col_double(),
## ACT = col_double(),
## QLD = col_double(),
## SA = col_double(),
## WA = col_double(),
## TAS = col_double(),
## VIC = col_double(),
## NT = col_double()
## )
# Load the deaths table from the working directory.
death <- read_csv(file = "./Deaths.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## Year = col_double(),
## NSW = col_double(),
## VIC = col_double(),
## QLD = col_double(),
## SA = col_double(),
## WA = col_double(),
## TAS = col_double(),
## NT = col_double(),
## ACT = col_double()
## )
# Load the total fertility rate (TFR) table from the working directory.
tfr <- read_csv(file = "./TFR.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## Year = col_double(),
## NSW = col_double(),
## VIC = col_double(),
## QLD = col_double(),
## SA = col_double(),
## WA = col_double(),
## TAS = col_double(),
## NT = col_double(),
## ACT = col_double()
## )
# Load the NOM (net overseas migration) table from the working directory.
nom <- read_csv(file = "./NOM.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## Year = col_double(),
## NSW = col_double(),
## VIC = col_double(),
## QLD = col_double(),
## SA = col_double(),
## WA = col_double(),
## TAS = col_double(),
## NT = col_double(),
## ACT = col_double()
## )
# Load the NIM (net interstate migration) table from the working directory.
nim <- read_csv(file = "./NIM.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## Year = col_double(),
## NSW = col_double(),
## VIC = col_double(),
## QLD = col_double(),
## SA = col_double(),
## WA = col_double(),
## TAS = col_double(),
## NT = col_double(),
## ACT = col_double()
## )
##Plot the number of births recorded in each state/territory for ##different Australian states ##over different years. ##a. Describe the trend in number of births for Queensland and Tasmania ##for the period 1977 to 2016? ##b. Draw a bar chart to show the number of births in each Australian ##state in 2016.
# Reshape the wide births table into long form: one row per (Year, state),
# with the state code in Pop_stats and the count in Value_birth.
# NOTE(review): NT is not part of this selection -- confirm that is intended.
births_long <- gather(
  select(birth, Year, NSW, SA, WA, ACT, VIC, TAS, QLD),
  key = "Pop_stats",
  value = "Value_birth",
  -Year
)
View(births_long)
#now we plot the no of births recorded in each state/territory
library(tidyverse)
# Births recorded in each state/territory over the years.
# The table is reshaped from `birth` directly so that every state column
# (including NT) is plotted, and each state gets its own coloured line;
# an unfilled stacked bar chart cannot show the per-state series.
birth %>%
  pivot_longer(-Year, names_to = "Pop_stats", values_to = "Value_birth") %>%
  ggplot(aes(x = Year, y = Value_birth, colour = Pop_stats)) +
  geom_line()
#birth trend in queensland and tasmania
# Distribution of yearly births for Tasmania and Queensland.
# `%in%` is required here: `Pop_stats == c("TAS", "QLD")` recycles the
# two-element vector along the column and silently drops about half of the
# matching rows.
# The state is used as the (discrete) x aesthetic so that geom_boxplot() draws
# one box per state instead of warning about a continuous x; free scales keep
# the much smaller Tasmanian counts readable.
births_long %>%
  filter(Pop_stats %in% c("TAS", "QLD")) %>%
  ggplot(aes(x = Pop_stats, y = Value_birth)) +
  geom_boxplot() +
  facet_wrap(~Pop_stats, scales = "free")
#we can make use of bar plots to compare instead of just using a boxplot
# Year-by-year births for Tasmania and Queensland, side by side.
# `%in%` keeps every row of both states (`==` against a two-element vector
# recycles and drops rows); the fill aesthetic plus dodged bars makes the two
# states distinguishable instead of stacking them into one anonymous bar.
births_long %>%
  filter(Pop_stats %in% c("TAS", "QLD")) %>%
  ggplot(aes(x = Year, y = Value_birth, fill = Pop_stats)) +
  geom_bar(stat = "identity", position = "dodge") +
  ggtitle("tas and qld population growth comparison") +
  xlab("Year") + ylab("Value")
View(birth)
##2. Inspect the data on Total Fertility Rate (TFR.csv) for Queensland ##and Northern Territory. ##a. What was the minimum value for TFR recorded in the dataset for ##Queensland and when did that occur? ## b. What was the corresponding TFR value for Northern Territory in the ##same year?
# Minimum TFR recorded for Queensland, the year(s) in which it occurred, and
# the Northern Territory TFR in the same year(s).
# The earlier expression took min() over the Year and QLD columns together,
# which only yields a bare number (it printed 1.8) and never reports the year
# or the NT value the question asks for.
tfr %>%
  filter(QLD == min(QLD, na.rm = TRUE)) %>%
  select(Year, QLD, NT)
##plot the natural growth in Australia’s population over different years. ##For this, you will need to aggregate the total births and deaths by ##year. Describe the trend in natural growth in Australian ##population over time.
# Long-form deaths: one row per (Year, state). Every state column is used;
# the earlier selection listed VIC twice and left out SA.
death_long <- death %>%
  pivot_longer(-Year, names_to = "Pop_stats_death", values_to = "Value_death")
View(death_long)
# Pair births with deaths by Year AND state. cbind() only glues rows together
# by position, and the two long tables listed the states in different orders,
# so births of one state were being subtracted from deaths of another.
# The births side is reshaped from `birth` here so all states are matched.
birth_death <- birth %>%
  pivot_longer(-Year, names_to = "Pop_stats", values_to = "Value_birth") %>%
  inner_join(death_long, by = c("Year", "Pop_stats" = "Pop_stats_death"))
View(birth_death)
# Natural growth (births minus deaths); the per-state bars stack to the
# national figure for each year.
birth_death %>%
  ggplot(aes(x = Year, y = Value_birth - Value_death)) +
  geom_bar(stat = "identity")
# Natural population growth = total births - total deaths, per year.
#
# Fixes relative to the earlier version:
#   * deaths left out SA (and listed VIC twice) while births left out TAS, so
#     the two totals covered different sets of states;
#   * `x %>% aggregate(x$Value, FUN = "sum", by = ...)` forwards the extra
#     positional argument to sum(), which adds the grand total of the whole
#     column to every yearly sum;
#   * births and deaths were subtracted by position; they are now joined on
#     Year.
death_long <- death %>%
  pivot_longer(-Year, names_to = "Pop_stats", values_to = "Value")
# Total deaths per year across all states/territories.
Death_sum <- death_long %>%
  group_by(Year) %>%
  summarise(Value = sum(Value), .groups = "drop")
View(Death_sum)
# Total births per year across all states/territories.
Birth_sum <- birth %>%
  pivot_longer(-Year, names_to = "Pop_stats", values_to = "Value") %>%
  group_by(Year) %>%
  summarise(Value = sum(Value), .groups = "drop")
View(Birth_sum)
# Natural population growth per year (columns: Year, Diff).
NPG <- Birth_sum %>%
  inner_join(Death_sum, by = "Year", suffix = c("_birth", "_death")) %>%
  transmute(Year, Diff = Value_birth - Value_death)
# Kept as a standalone vector as well, as in the earlier version.
Diff <- NPG$Diff
print(typeof(Diff))
## [1] "double"
View(NPG)
NPG %>%
  ggplot(aes(x = Year, y = Diff)) +
  geom_line() +
  ggtitle("Natural population growth over the years")
# Line plot of the VIC column of `nom` against Year.
ggplot(nom, aes(x = Year, y = VIC)) +
  geom_line() +
  labs(title = "for Victoria")
# Line plot of the WA column of `nom` against Year.
ggplot(nom, aes(x = Year, y = WA)) +
  geom_line() +
  labs(title = "for WA")
# Line plot of the TAS column of `nom` against Year.
# The `+` after geom_line() was missing, so ggtitle() ran as a separate
# statement (printing a bare labels object) and the plot had no title.
nom %>%
  ggplot(aes(x = Year, y = TAS)) +
  geom_line() +
  ggtitle("for TAsmania")
#moving on to the plot of net overseas migration
View(nom)
# Long-form NOM table: one row per (Year, state). All state columns are
# reshaped; the earlier selection left TAS out.
nom_long <- nom %>%
  pivot_longer(-Year, names_to = "Nom_states", values_to = "Value")
View(nom_long)
# Stacked bars filled by state, so each state's contribution to the yearly
# total is visible rather than a single undifferentiated bar.
nom_long %>%
  ggplot(aes(x = Year, y = Value, fill = Nom_states)) +
  geom_bar(stat = "identity")
## now we combine the nom and nim
# Join the NOM and NIM tables on Year. Columns that share a name get the
# default merge() suffixes: `.x` for nom and `.y` for nim.
nom_nim <- merge(x = nom, y = nim, by = "Year")
View(nom_nim)
View(nom_nim)
#now we plot the nom and nim for qld combined (as a line plot first)
# QLD NOM (QLD.x, from `nom`) and NIM (QLD.y, from `nim`) over the years.
# A string inside aes() is a legend label, not a colour: aes(color = "green")
# produced a legend entry called "green" drawn in a default palette colour.
# The series are labelled by name and the intended colours are assigned with
# scale_colour_manual().
nom_nim %>%
  ggplot(aes(x = Year)) +
  geom_line(aes(y = QLD.x, colour = "NOM")) +
  geom_line(aes(y = QLD.y, colour = "NIM")) +
  scale_colour_manual(name = "Series", values = c(NOM = "green", NIM = "red")) +
  ggtitle("nim and nom comparison for qld for the years") +
  ylab("QLD.x and QLD.y")
#but we need a scatter plot
# Scatter plot of QLD NOM (QLD.x, from `nom`) and NIM (QLD.y, from `nim`).
# Colour names placed inside aes() are treated as legend labels, so the
# series are labelled by name and the colours are set in scale_colour_manual().
nom_nim %>%
  ggplot(aes(x = Year)) +
  geom_point(aes(y = QLD.x, colour = "NOM")) +
  geom_point(aes(y = QLD.y, colour = "NIM")) +
  scale_colour_manual(name = "Series", values = c(NOM = "green", NIM = "red")) +
  ggtitle("nim and nom comparison for qld for the years") +
  ylab("QLD.x and QLD.y")
##d. Finally, plot the Net Interstate Migration (NIM) for Queensland and ##New South Wales over different years. Note graphs for both QLD and NSW ##should be on the same plot. Compare these two states on the plot. What ##can you infer from the trend you see for these two states? Discuss your ##findings
# NIM for Queensland and New South Wales on the same axes.
# The legend is keyed by state; the actual line colours are assigned in
# scale_colour_manual() because a colour name inside aes() is only a label.
nim %>%
  ggplot(aes(x = Year)) +
  geom_line(aes(y = QLD, colour = "QLD")) +
  geom_line(aes(y = NSW, colour = "NSW")) +
  scale_colour_manual(name = "State", values = c(QLD = "green", NSW = "brown")) +
  ggtitle("nim for QLD and NSW combined together") +
  ylab("QLD AND NSW data") + xlab("Year")
`
library(tidyverse)
# Load the tweet feature table from the working directory.
twitter <- read_csv(file = "./twitter_data.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double()
## )
## i Use `spec()` for the full column specifications.
View(twitter)
## total number of tweets: count() with no grouping gives the row count
twitter %>%
  count()
## # A tibble: 1 x 1
## n
## <int>
## 1 39955
## keep only the rows whose isVerified flag is 1, then count them
verified_tweets <- filter(twitter, isVerified == 1)
verified_tweets %>%
  count()
## # A tibble: 1 x 1
## n
## <int>
## 1 218
# Histogram of the `#entities` column.
# The column is referenced by its (backtick-quoted) name inside aes() rather
# than through `twitter$`: `$` bypasses the data passed to ggplot() and breaks
# as soon as the piped data is filtered or facetted.
twitter %>%
  ggplot(aes(x = `#entities`)) +
  geom_histogram(binwidth = 0.5) +
  ggtitle("plot of entities using histogram") +
  xlab("entities") + ylab("count")
# rows with relevanceJudge equal to 1, followed by per-column summaries
new_twitter <- filter(twitter, relevanceJudge == 1)
summary(new_twitter)
## text_score text_score_expansion hashtag hasURL
## Min. :-16.000 Min. :-16.000 Min. :0.000 Min. :0.000
## 1st Qu.:-16.000 1st Qu.:-16.000 1st Qu.:0.000 1st Qu.:1.000
## Median : -9.160 Median : -8.567 Median :0.000 Median :1.000
## Mean :-10.699 Mean :-10.298 Mean :0.191 Mean :0.819
## 3rd Qu.: -7.982 3rd Qu.: -7.492 3rd Qu.:0.000 3rd Qu.:1.000
## Max. : -5.717 Max. : -4.598 Max. :1.000 Max. :1.000
## isReply length tweet_topic_time_diff semantic_overlap
## Min. :0.00000 Min. : 0.00 Min. : 0.000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.: 62.00 1st Qu.: 0.000 1st Qu.:0.0000
## Median :0.00000 Median : 99.00 Median : 3.000 Median :0.0000
## Mean :0.03372 Mean : 90.28 Mean : 4.432 Mean :0.2535
## 3rd Qu.:0.00000 3rd Qu.:116.00 3rd Qu.: 8.000 3rd Qu.:1.0000
## Max. :1.00000 Max. :141.00 Max. :16.000 Max. :1.0000
## #entityTypes #entities organization_entities person_entities
## Min. :0.0000 Min. : 0.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.: 1.000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :1.0000 Median : 2.000 Median :0.0000 Median :0.0000
## Mean :0.7955 Mean : 2.367 Mean :0.3166 Mean :0.2762
## 3rd Qu.:1.0000 3rd Qu.: 3.000 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :3.0000 Max. :10.000 Max. :4.0000 Max. :4.0000
## work_entities event_entities species_entities places_entities
## Min. :0.0000 Min. :0.00000 Min. :0.000000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.:0.0000
## Median :0.0000 Median :0.00000 Median :0.000000 Median :0.0000
## Mean :0.1878 Mean :0.00071 Mean :0.005325 Mean :0.1761
## 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.000000 3rd Qu.:0.0000
## Max. :4.0000 Max. :1.00000 Max. :3.000000 Max. :4.0000
## nFollowers nFriends nFavorties nListed
## Min. : 0 Min. : 0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 190 1st Qu.: 64 1st Qu.: 0.0 1st Qu.: 3.0
## Median : 649 Median : 359 Median : 1.0 Median : 14.0
## Mean : 6502 Mean : 1802 Mean : 184.3 Mean : 209.1
## 3rd Qu.: 2173 3rd Qu.: 1448 3rd Qu.: 11.0 3rd Qu.: 61.0
## Max. :1967317 Max. :181629 Max. :57064.0 Max. :51578.0
## isVerified isGeoEnabled twitterAge #tweetsPosted
## Min. :0.00000 Min. :0.0000 Min. :0.000 Min. : 0
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:1.666 1st Qu.: 2988
## Median :0.00000 Median :0.0000 Median :2.383 Median : 12094
## Mean :0.01136 Mean :0.2258 Mean :2.351 Mean : 29863
## 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:2.952 3rd Qu.: 34790
## Max. :1.00000 Max. :1.0000 Max. :5.596 Max. :545006
## relevanceJudge
## Min. :1
## 1st Qu.:1
## Median :1
## Mean :1
## 3rd Qu.:1
## Max. :1
# rows with relevanceJudge equal to 0, followed by per-column summaries
twitter_judge_zero <- filter(twitter, relevanceJudge == 0)
summary(twitter_judge_zero)
## text_score text_score_expansion hashtag hasURL
## Min. :-16.000 Min. :-16.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:-16.000 1st Qu.:-16.000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :-16.000 Median :-16.000 Median :0.0000 Median :1.0000
## Mean :-14.406 Mean :-14.261 Mean :0.1935 Mean :0.5417
## 3rd Qu.:-16.000 3rd Qu.:-16.000 3rd Qu.:0.0000 3rd Qu.:1.0000
## Max. : -5.588 Max. : -4.501 Max. :1.0000 Max. :1.0000
## isReply length tweet_topic_time_diff semantic_overlap
## Min. :0.0000 Min. : 0.00 Min. : 0.000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.: 58.00 1st Qu.: 0.000 1st Qu.:0.00000
## Median :0.0000 Median : 96.00 Median : 2.000 Median :0.00000
## Mean :0.1415 Mean : 87.82 Mean : 3.572 Mean :0.04653
## 3rd Qu.:0.0000 3rd Qu.:116.00 3rd Qu.: 6.000 3rd Qu.:0.00000
## Max. :1.0000 Max. :255.00 Max. :16.000 Max. :1.00000
## #entityTypes #entities organization_entities person_entities
## Min. :0.0000 Min. : 0.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.: 0.000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median : 2.000 Median :0.0000 Median :0.0000
## Mean :0.5973 Mean : 1.882 Mean :0.1882 Mean :0.1814
## 3rd Qu.:1.0000 3rd Qu.: 3.000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :4.0000 Max. :11.000 Max. :8.0000 Max. :8.0000
## work_entities event_entities species_entities places_entities
## Min. : 0.0000 Min. :0.000000 Min. :0.00000 Min. :0.0000
## 1st Qu.: 0.0000 1st Qu.:0.000000 1st Qu.:0.00000 1st Qu.:0.0000
## Median : 0.0000 Median :0.000000 Median :0.00000 Median :0.0000
## Mean : 0.2454 Mean :0.004254 Mean :0.01193 Mean :0.1165
## 3rd Qu.: 0.0000 3rd Qu.:0.000000 3rd Qu.:0.00000 3rd Qu.:0.0000
## Max. :12.0000 Max. :2.000000 Max. :5.00000 Max. :9.0000
## nFollowers nFriends nFavorties nListed
## Min. : 0 Min. : 0.0 Min. : 0 Min. : 0
## 1st Qu.: 149 1st Qu.: 78.0 1st Qu.: 0 1st Qu.: 2
## Median : 472 Median : 288.0 Median : 2 Median : 9
## Mean : 4162 Mean : 1267.8 Mean : 186 Mean : 101
## 3rd Qu.: 1432 3rd Qu.: 902.8 3rd Qu.: 25 3rd Qu.: 37
## Max. :4853601 Max. :561555.0 Max. :551473 Max. :97531
## isVerified isGeoEnabled twitterAge #tweetsPosted
## Min. :0.000000 Min. :0.0000 Min. :0.000 Min. : 0
## 1st Qu.:0.000000 1st Qu.:0.0000 1st Qu.:1.556 1st Qu.: 2481
## Median :0.000000 Median :0.0000 Median :2.214 Median : 10184
## Mean :0.005008 Mean :0.2472 Mean :2.207 Mean : 28889
## 3rd Qu.:0.000000 3rd Qu.:0.0000 3rd Qu.:2.834 3rd Qu.: 29962
## Max. :1.000000 Max. :1.0000 Max. :5.624 Max. :1399152
## relevanceJudge
## Min. :0
## 1st Qu.:0
## Median :0
## Mean :0
## 3rd Qu.:0
## Max. :0
# Mean of the `length` column for the relevanceJudge == 1 rows.
# na.rm belongs inside mean(); as a summarise() argument it only created an
# extra constant column called `na.rm`.
new_twitter %>%
  summarise(avg = mean(length, na.rm = TRUE))
## # A tibble: 1 x 1
##     avg
##   <dbl>
## 1  90.3
# Mean of the `length` column for the relevanceJudge == 0 rows.
# na.rm belongs inside mean(); as a summarise() argument it only created an
# extra constant column called `na.rm`.
twitter_judge_zero %>%
  summarise(avg = mean(length, na.rm = TRUE))
## # A tibble: 1 x 1
##     avg
##   <dbl>
## 1  87.8
# Bin twitterAge into one-unit ranges.
# The breaks start at 0 because the summaries above show twitterAge values
# down to 0; with breaks starting at 1 every value below 1 was turned into NA.
# cut() has no `na.rm` argument (it was silently ignored), so it is dropped.
twitter_age_range <- cut(
  twitter$twitterAge,
  breaks = 0:6,
  labels = c("0-1", "1-2", "2-3", "3-4", "4-5", "5-6"),
  include.lowest = TRUE
)
View(twitter)
# Attach the bins as the first column of the table.
twitter <- cbind(twitter_age_range, twitter)
# Keep four columns, then stack twitterAge into a key/value pair
# (`count` holds the column name, `value` holds its values); the other three
# columns are carried along unchanged.
second_twitter <- gather(
  select(twitter, twitter_age_range, twitterAge, `#tweetsPosted`, length),
  key = "count",
  value = "value",
  twitterAge
)
View(second_twitter)
# Median tweet length per age range, then a boxplot of tweet length for each
# age range.
# The earlier version computed a single overall median, converted that
# one-cell data frame to a factor and mapped it as an aesthetic, which cannot
# produce per-range boxes. geom_boxplot() derives the median (and quartiles)
# of `length` for every age range by itself.
median_length <- twitter %>%
  group_by(twitter_age_range) %>%
  summarise(median_length = median(length, na.rm = TRUE), .groups = "drop")
View(median_length)
print(typeof(median_length))
## [1] "list"
twitter %>%
  ggplot(aes(x = twitter_age_range, y = length)) +
  geom_boxplot()
##now we move on plot which group is more active in tweeting and ## the age group which has highest no of followers
# Total tweets posted per age range.
# The column name needs backticks: '#tweetsPosted' in single quotes is a
# string constant, so every bar was built from the same literal text instead
# of the column values. na.rm is a layer argument, not an aesthetic.
twitter %>%
  ggplot(aes(x = twitter_age_range, y = `#tweetsPosted`)) +
  geom_col(na.rm = TRUE) +
  xlab("age range") +
  ylab("tweets posted")
# Bar chart of nFollowers stacked within each age range.
ggplot(
  select(twitter, twitter_age_range, nFollowers),
  aes(x = twitter_age_range, y = nFollowers)
) +
  geom_col(fill = "green") +
  labs(
    title = "bar plot to show the no of followers",
    x = "age range",
    y = "no of followers"
  )