Assignment 1 backup

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

## Performing Task A1: data analysis of the birth, death, TFR, NOM and NIM CSV files

# Print the working directory; the relative "./*.csv" paths used below are
# resolved against it.
getwd()

## [1] "C:/Users/Dushyant/Downloads/FIT5145/assignment1/FIT5145 A1 Data"

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.0.3

## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.2
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.0

## Warning: package 'tidyr' was built under R version 4.0.3

## Warning: package 'readr' was built under R version 4.0.3

## Warning: package 'purrr' was built under R version 4.0.3

## Warning: package 'dplyr' was built under R version 4.0.3

## Warning: package 'stringr' was built under R version 4.0.3

## Warning: package 'forcats' was built under R version 4.0.3

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

# Load the births table: a Year column plus one numeric column per
# state/territory.
birth<-read_csv("./Births.csv")

## 
## -- Column specification --------------------------------------------------------
## cols(
##   Year = col_double(),
##   NSW = col_double(),
##   ACT = col_double(),
##   QLD = col_double(),
##   SA = col_double(),
##   WA = col_double(),
##   TAS = col_double(),
##   VIC = col_double(),
##   NT = col_double()
## )

# Load the deaths table: a Year column plus one numeric column per
# state/territory.
death<-read_csv("./Deaths.csv")

## 
## -- Column specification --------------------------------------------------------
## cols(
##   Year = col_double(),
##   NSW = col_double(),
##   VIC = col_double(),
##   QLD = col_double(),
##   SA = col_double(),
##   WA = col_double(),
##   TAS = col_double(),
##   NT = col_double(),
##   ACT = col_double()
## )

# Load the Total Fertility Rate (TFR) table: a Year column plus one numeric
# column per state/territory.
tfr<-read_csv("./TFR.csv")

## 
## -- Column specification --------------------------------------------------------
## cols(
##   Year = col_double(),
##   NSW = col_double(),
##   VIC = col_double(),
##   QLD = col_double(),
##   SA = col_double(),
##   WA = col_double(),
##   TAS = col_double(),
##   NT = col_double(),
##   ACT = col_double()
## )

# Load the net overseas migration (NOM) table: a Year column plus one numeric
# column per state/territory.
nom<-read_csv("./NOM.csv")

## 
## -- Column specification --------------------------------------------------------
## cols(
##   Year = col_double(),
##   NSW = col_double(),
##   VIC = col_double(),
##   QLD = col_double(),
##   SA = col_double(),
##   WA = col_double(),
##   TAS = col_double(),
##   NT = col_double(),
##   ACT = col_double()
## )

# Load the net interstate migration (NIM) table: a Year column plus one numeric
# column per state/territory.
nim<-read_csv("./NIM.csv")

## 
## -- Column specification --------------------------------------------------------
## cols(
##   Year = col_double(),
##   NSW = col_double(),
##   VIC = col_double(),
##   QLD = col_double(),
##   SA = col_double(),
##   WA = col_double(),
##   TAS = col_double(),
##   NT = col_double(),
##   ACT = col_double()
## )

## Plot the number of births recorded in each state/territory for different Australian states over different years. a. Describe the trend in number of births for Queensland and Tasmania for the period 1977 to 2016? b. Draw a bar chart to show the number of births in each Australian state in 2016.

# Reshape the births table from wide (one column per state) to long form:
# one row per Year/state pair, state code in Pop_stats, count in Value_birth.
# NOTE(review): NT is not selected, so the Northern Territory is missing from
# births_long and from every plot built on it. Confirm whether that is intended
# before adding it, since later steps reuse births_long.
births_long <- birth %>%
  select(Year, NSW, SA, WA, ACT, VIC, TAS, QLD) %>%
  gather(Pop_stats, Value_birth, -Year)

View(births_long)

# Plot the number of births recorded in each state/territory per year.
# One coloured line per state keeps the states distinguishable; an unfilled
# stacked bar only shows the yearly total across states.
births_long %>%
  ggplot(aes(x = Year, y = Value_birth, colour = Pop_stats)) +
  geom_line() +
  ggtitle("Number of births by state/territory") +
  xlab("Year") + ylab("Births")

# Birth trend in Queensland and Tasmania.
# `%in%` keeps every TAS and QLD row. `== c("TAS", "QLD")` recycles the
# two-element vector down the column, comparing alternate rows against
# alternate states and silently discarding roughly half of the matching rows.
# A line per state shows the trend over time; a box plot on a continuous x
# axis collapses all years into a single box.
births_long %>%
  filter(Pop_stats %in% c("TAS", "QLD")) %>%
  ggplot(aes(x = Year, y = Value_birth)) +
  geom_line() +
  # Free y scales so each state's own trend stays readable in its panel.
  facet_wrap(~Pop_stats, scales = "free_y")

# A side-by-side bar chart is another way to compare the two states.
# `%in%` is required here: `==` against a two-element vector recycles and
# drops roughly half of the TAS/QLD rows.
births_long %>%
  filter(Pop_stats %in% c("TAS", "QLD")) %>%
  # fill + dodge draws one bar per state per year instead of stacking both
  # states into a single undifferentiated bar.
  ggplot(aes(x = Year, y = Value_birth, fill = Pop_stats)) +
  geom_bar(stat = "identity", position = "dodge") +
  ggtitle("TAS and QLD births comparison") +
  xlab("Year") + ylab("Value")

View(birth)

## 2. Inspect the data on Total Fertility Rate (TFR.csv) for Queensland and Northern Territory. a. What was the minimum value for TFR recorded in the dataset for Queensland and when did that occur? b. What was the corresponding TFR value for Northern Territory in the same year?

# a/b. Row(s) where Queensland's TFR is at its minimum: shows the year it
# occurred and the Northern Territory TFR for that same year.
tfr %>%
  filter(QLD == min(QLD, na.rm = TRUE)) %>%
  select(Year, QLD, NT)

# The minimum Queensland TFR itself. It is taken over the QLD column only;
# mixing the Year column into min() only works by accident because the years
# are larger than any fertility rate.
min(tfr$QLD, na.rm = TRUE)

## [1] 1.8

## Plot the natural growth in Australia’s population over different years. For this, you will need to aggregate the total births and deaths by year. Describe the trend in natural growth in Australian population over time.

# Natural growth = births - deaths.
# Both tables are reshaped to long form over the same eight states/territories
# and matched on Year AND state. Binding the two long tables side by side with
# cbind() pairs rows purely by position, which lines up different states
# whenever the two column selections differ in order or content.
death_long <- death %>%
  select(Year, NSW, VIC, QLD, SA, WA, TAS, NT, ACT) %>%
  pivot_longer(-Year, names_to = "Pop_stats_death", values_to = "Value_death")
View(death_long)

birth_death <- birth %>%
  select(Year, NSW, VIC, QLD, SA, WA, TAS, NT, ACT) %>%
  pivot_longer(-Year, names_to = "Pop_stats", values_to = "Value_birth") %>%
  inner_join(death_long, by = c("Year", "Pop_stats" = "Pop_stats_death"))
View(birth_death)

# Aggregate to a single natural-growth figure per year and plot it.
birth_death %>%
  group_by(Year) %>%
  summarise(Natural_growth = sum(Value_birth - Value_death), .groups = "drop") %>%
  ggplot(aes(x = Year, y = Natural_growth)) +
  geom_bar(stat = "identity")

# Total deaths per year across all eight states/territories.
death_long <- death %>%
  select(Year, NSW, VIC, QLD, SA, WA, TAS, NT, ACT) %>%
  gather(Pop_stats, Value, -Year)

# group_by() + summarise() gives one row per Year. Piping the data frame into
# aggregate() while also passing the Value column positionally forwards that
# column to sum() as an extra argument, which adds the grand total of all
# years to every yearly sum.
Death_sum <- death_long %>%
  group_by(Year) %>%
  summarise(Value = sum(Value), .groups = "drop")
View(Death_sum)

# Similarly, total births per year over the same eight states/territories.
Birth_sum <- birth %>%
  select(Year, NSW, VIC, QLD, SA, WA, TAS, NT, ACT) %>%
  gather(Pop_stats, Value, -Year) %>%
  group_by(Year) %>%
  summarise(Value = sum(Value), .groups = "drop")
View(Birth_sum)

# Natural population growth per year from 1977 until 2016: births minus deaths.
# match() aligns the birth totals to the years in Death_sum explicitly, so Diff
# is in the same order as Death_sum$Year.
Diff <- Birth_sum$Value[match(Death_sum$Year, Birth_sum$Year)] - Death_sum$Value
View(Diff)
print(typeof(Diff))

## [1] "double"

# Natural population growth (NPG) table: one row per year, with named columns
# set at construction instead of renaming auto-generated ones by position.
NPG <- data.frame(Year = Death_sum$Year, Diff = Diff)
View(NPG)

# Trend of natural population growth over time.
NPG %>%
  ggplot(aes(x = Year, y = Diff)) +
  geom_line() +
  ggtitle("Natural population growth over the years")

Now we move on to Task A2.

In this task we focus on NOM (net overseas migration)

and NIM (net interstate migration).

# Net overseas migration over the years: Victoria.
ggplot(data = nom, mapping = aes(x = Year, y = VIC)) +
  geom_line() +
  labs(title = "for Victoria")

# Net overseas migration over the years: Western Australia.
ggplot(data = nom, mapping = aes(x = Year, y = WA)) +
  geom_line() +
  labs(title = "for WA")

# Net overseas migration over the years: Tasmania.
# The `+` after geom_line() is required: without it the plot expression ends
# there and ggtitle() is evaluated as a separate statement, so the plot has no
# title and the bare labels object is printed instead.
nom %>%
  ggplot(aes(x = Year, y = TAS)) +
  geom_line() +
  ggtitle("for TAsmania")

#moving on to the plot of net overseas migration

View(nom)
# Long form of NOM: one row per Year/state pair, state code in Nom_states,
# migration figure in Value. All eight state/territory columns are included
# (TAS as well).
nom_long <- nom %>%
  select(Year, NSW, VIC, QLD, SA, WA, TAS, NT, ACT) %>%
  gather(Nom_states, Value, -Year)
View(nom_long)

# Stacked bars per year, filled by state so each state's share is visible.
nom_long %>%
  ggplot(aes(x = Year, y = Value, fill = Nom_states)) +
  geom_bar(stat = "identity")

## now we combine the nom and nim

# Join NOM and NIM on Year. State columns that exist in both tables get
# merge()'s default suffixes: ".x" for nom and ".y" for nim.
nom_nim <- merge(x = nom, y = nim, by.x = "Year", by.y = "Year")
View(nom_nim)


View(nom_nim)
# Line plot of NOM and NIM for Queensland over the years.
# QLD.x comes from nom and QLD.y from nim (merge() suffixes).
# The series label goes inside aes() and the colours are assigned in
# scale_colour_manual(). A colour name written inside aes() is treated as a
# data label, so the lines are drawn in default palette colours rather than
# the ones named.
nom_nim %>%
  ggplot(aes(x = Year)) +
  geom_line(aes(y = QLD.x, colour = "NOM")) +
  geom_line(aes(y = QLD.y, colour = "NIM")) +
  scale_colour_manual(name = "Series", values = c(NOM = "green", NIM = "red")) +
  ggtitle("nim and nom comparison for qld for the years") +
  ylab("QLD.x and QLD.y")

# Scatter plot version of the NOM/NIM comparison for Queensland.
# QLD.x comes from nom and QLD.y from nim (merge() suffixes).
# Colours are assigned in scale_colour_manual(); a colour name written inside
# aes() is only a legend label, not the colour used for drawing.
nom_nim %>%
  ggplot(aes(x = Year)) +
  geom_point(aes(y = QLD.x, colour = "NOM")) +
  geom_point(aes(y = QLD.y, colour = "NIM")) +
  scale_colour_manual(name = "Series", values = c(NOM = "green", NIM = "red")) +
  ggtitle("nim and nom comparison for qld for the years") +
  ylab("QLD.x and QLD.y")

## d. Finally, plot the Net Interstate Migration (NIM) for Queensland and New South Wales over different years. Note graphs for both QLD and NSW should be on the same plot. Compare these two states on the plot. What can you infer from the trend you see for these two states? Discuss your findings.

# NIM for Queensland and New South Wales on the same axes.
# The state name is mapped inside aes() and the colours are assigned in
# scale_colour_manual(), so the legend names the states and the lines are
# drawn in the intended colours.
nim %>%
  ggplot(aes(x = Year)) +
  geom_line(aes(y = QLD, colour = "QLD")) +
  geom_line(aes(y = NSW, colour = "NSW")) +
  scale_colour_manual(name = "State", values = c(QLD = "green", NSW = "brown")) +
  ggtitle("nim for QLD and NSW combined together") +
  ylab("QLD AND NSW data") + xlab("Year")

Next, we read the Twitter dataset and perform the following operations on

the dataset.

# tidyverse supplies read_csv() (readr) and the dplyr/ggplot2 functions used in
# the rest of this section.
library(tidyverse)
# Load the Twitter data set from the working directory; per the column
# specification printed below, columns default to numeric (double).
twitter<-read_csv("./twitter_data.csv")

## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double()
## )
## i Use `spec()` for the full column specifications.

View(twitter)
## Total number of tweets: count() with no grouping returns the row count.
twitter %>% count()

## # A tibble: 1 x 1
##       n
##   <int>
## 1 39955

## Keep only the rows flagged isVerified == 1, then count them.
verified_tweets <- filter(twitter, isVerified == 1)
verified_tweets %>% count()

## # A tibble: 1 x 1
##       n
##   <int>
## 1   218

Plotting a histogram of the number of entities (#entities).

# Histogram of the `#entities` column.
# The column is referenced by its (backtick-quoted) name inside aes() so that
# ggplot looks it up in the piped data. `twitter$...` inside aes() bypasses the
# plot's data and breaks as soon as the data is filtered or faceted.
twitter %>%
  ggplot(aes(x = `#entities`)) +
  geom_histogram(binwidth = 0.5) +
  ggtitle("plot of entities using histogram") +
  xlab("entities") + ylab("count")

Now we compute the descriptive statistics of #entities for relevanceJudge = 1 and relevanceJudge = 0.

# Subset with relevanceJudge == 1; summary() reports descriptive statistics
# for every column of the subset, `#entities` included.
new_twitter <- filter(twitter, relevanceJudge == 1)
summary(new_twitter)

##    text_score      text_score_expansion    hashtag          hasURL     
##  Min.   :-16.000   Min.   :-16.000      Min.   :0.000   Min.   :0.000  
##  1st Qu.:-16.000   1st Qu.:-16.000      1st Qu.:0.000   1st Qu.:1.000  
##  Median : -9.160   Median : -8.567      Median :0.000   Median :1.000  
##  Mean   :-10.699   Mean   :-10.298      Mean   :0.191   Mean   :0.819  
##  3rd Qu.: -7.982   3rd Qu.: -7.492      3rd Qu.:0.000   3rd Qu.:1.000  
##  Max.   : -5.717   Max.   : -4.598      Max.   :1.000   Max.   :1.000  
##     isReply            length       tweet_topic_time_diff semantic_overlap
##  Min.   :0.00000   Min.   :  0.00   Min.   : 0.000        Min.   :0.0000  
##  1st Qu.:0.00000   1st Qu.: 62.00   1st Qu.: 0.000        1st Qu.:0.0000  
##  Median :0.00000   Median : 99.00   Median : 3.000        Median :0.0000  
##  Mean   :0.03372   Mean   : 90.28   Mean   : 4.432        Mean   :0.2535  
##  3rd Qu.:0.00000   3rd Qu.:116.00   3rd Qu.: 8.000        3rd Qu.:1.0000  
##  Max.   :1.00000   Max.   :141.00   Max.   :16.000        Max.   :1.0000  
##   #entityTypes      #entities      organization_entities person_entities 
##  Min.   :0.0000   Min.   : 0.000   Min.   :0.0000        Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.: 1.000   1st Qu.:0.0000        1st Qu.:0.0000  
##  Median :1.0000   Median : 2.000   Median :0.0000        Median :0.0000  
##  Mean   :0.7955   Mean   : 2.367   Mean   :0.3166        Mean   :0.2762  
##  3rd Qu.:1.0000   3rd Qu.: 3.000   3rd Qu.:1.0000        3rd Qu.:0.0000  
##  Max.   :3.0000   Max.   :10.000   Max.   :4.0000        Max.   :4.0000  
##  work_entities    event_entities    species_entities   places_entities 
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.000000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.000000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.00000   Median :0.000000   Median :0.0000  
##  Mean   :0.1878   Mean   :0.00071   Mean   :0.005325   Mean   :0.1761  
##  3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:0.000000   3rd Qu.:0.0000  
##  Max.   :4.0000   Max.   :1.00000   Max.   :3.000000   Max.   :4.0000  
##    nFollowers         nFriends        nFavorties         nListed       
##  Min.   :      0   Min.   :     0   Min.   :    0.0   Min.   :    0.0  
##  1st Qu.:    190   1st Qu.:    64   1st Qu.:    0.0   1st Qu.:    3.0  
##  Median :    649   Median :   359   Median :    1.0   Median :   14.0  
##  Mean   :   6502   Mean   :  1802   Mean   :  184.3   Mean   :  209.1  
##  3rd Qu.:   2173   3rd Qu.:  1448   3rd Qu.:   11.0   3rd Qu.:   61.0  
##  Max.   :1967317   Max.   :181629   Max.   :57064.0   Max.   :51578.0  
##    isVerified       isGeoEnabled      twitterAge    #tweetsPosted   
##  Min.   :0.00000   Min.   :0.0000   Min.   :0.000   Min.   :     0  
##  1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:1.666   1st Qu.:  2988  
##  Median :0.00000   Median :0.0000   Median :2.383   Median : 12094  
##  Mean   :0.01136   Mean   :0.2258   Mean   :2.351   Mean   : 29863  
##  3rd Qu.:0.00000   3rd Qu.:0.0000   3rd Qu.:2.952   3rd Qu.: 34790  
##  Max.   :1.00000   Max.   :1.0000   Max.   :5.596   Max.   :545006  
##  relevanceJudge
##  Min.   :1     
##  1st Qu.:1     
##  Median :1     
##  Mean   :1     
##  3rd Qu.:1     
##  Max.   :1

# Subset with relevanceJudge == 0; summary() reports descriptive statistics
# for every column of the subset, `#entities` included.
twitter_judge_zero <- filter(twitter, relevanceJudge == 0)
summary(twitter_judge_zero)

##    text_score      text_score_expansion    hashtag           hasURL      
##  Min.   :-16.000   Min.   :-16.000      Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:-16.000   1st Qu.:-16.000      1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :-16.000   Median :-16.000      Median :0.0000   Median :1.0000  
##  Mean   :-14.406   Mean   :-14.261      Mean   :0.1935   Mean   :0.5417  
##  3rd Qu.:-16.000   3rd Qu.:-16.000      3rd Qu.:0.0000   3rd Qu.:1.0000  
##  Max.   : -5.588   Max.   : -4.501      Max.   :1.0000   Max.   :1.0000  
##     isReply           length       tweet_topic_time_diff semantic_overlap 
##  Min.   :0.0000   Min.   :  0.00   Min.   : 0.000        Min.   :0.00000  
##  1st Qu.:0.0000   1st Qu.: 58.00   1st Qu.: 0.000        1st Qu.:0.00000  
##  Median :0.0000   Median : 96.00   Median : 2.000        Median :0.00000  
##  Mean   :0.1415   Mean   : 87.82   Mean   : 3.572        Mean   :0.04653  
##  3rd Qu.:0.0000   3rd Qu.:116.00   3rd Qu.: 6.000        3rd Qu.:0.00000  
##  Max.   :1.0000   Max.   :255.00   Max.   :16.000        Max.   :1.00000  
##   #entityTypes      #entities      organization_entities person_entities 
##  Min.   :0.0000   Min.   : 0.000   Min.   :0.0000        Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.: 0.000   1st Qu.:0.0000        1st Qu.:0.0000  
##  Median :0.0000   Median : 2.000   Median :0.0000        Median :0.0000  
##  Mean   :0.5973   Mean   : 1.882   Mean   :0.1882        Mean   :0.1814  
##  3rd Qu.:1.0000   3rd Qu.: 3.000   3rd Qu.:0.0000        3rd Qu.:0.0000  
##  Max.   :4.0000   Max.   :11.000   Max.   :8.0000        Max.   :8.0000  
##  work_entities     event_entities     species_entities  places_entities 
##  Min.   : 0.0000   Min.   :0.000000   Min.   :0.00000   Min.   :0.0000  
##  1st Qu.: 0.0000   1st Qu.:0.000000   1st Qu.:0.00000   1st Qu.:0.0000  
##  Median : 0.0000   Median :0.000000   Median :0.00000   Median :0.0000  
##  Mean   : 0.2454   Mean   :0.004254   Mean   :0.01193   Mean   :0.1165  
##  3rd Qu.: 0.0000   3rd Qu.:0.000000   3rd Qu.:0.00000   3rd Qu.:0.0000  
##  Max.   :12.0000   Max.   :2.000000   Max.   :5.00000   Max.   :9.0000  
##    nFollowers         nFriends          nFavorties        nListed     
##  Min.   :      0   Min.   :     0.0   Min.   :     0   Min.   :    0  
##  1st Qu.:    149   1st Qu.:    78.0   1st Qu.:     0   1st Qu.:    2  
##  Median :    472   Median :   288.0   Median :     2   Median :    9  
##  Mean   :   4162   Mean   :  1267.8   Mean   :   186   Mean   :  101  
##  3rd Qu.:   1432   3rd Qu.:   902.8   3rd Qu.:    25   3rd Qu.:   37  
##  Max.   :4853601   Max.   :561555.0   Max.   :551473   Max.   :97531  
##    isVerified        isGeoEnabled      twitterAge    #tweetsPosted    
##  Min.   :0.000000   Min.   :0.0000   Min.   :0.000   Min.   :      0  
##  1st Qu.:0.000000   1st Qu.:0.0000   1st Qu.:1.556   1st Qu.:   2481  
##  Median :0.000000   Median :0.0000   Median :2.214   Median :  10184  
##  Mean   :0.005008   Mean   :0.2472   Mean   :2.207   Mean   :  28889  
##  3rd Qu.:0.000000   3rd Qu.:0.0000   3rd Qu.:2.834   3rd Qu.:  29962  
##  Max.   :1.000000   Max.   :1.0000   Max.   :5.624   Max.   :1399152  
##  relevanceJudge
##  Min.   :0     
##  1st Qu.:0     
##  Median :0     
##  Mean   :0     
##  3rd Qu.:0     
##  Max.   :0

Average length of tweets that are judged as relevant, and

average length of tweets that are judged as non-relevant.

# Mean tweet length for the relevant subset.
# na.rm belongs inside mean(); passed to summarise() it just creates an extra
# column called na.rm.
new_twitter %>%
  summarise(avg = mean(length, na.rm = TRUE))

## # A tibble: 1 x 1
##     avg
##   <dbl>
## 1  90.3

# Mean tweet length for the non-relevant subset.
# na.rm belongs inside mean(); passed to summarise() it just creates an extra
# column called na.rm.
twitter_judge_zero %>%
  summarise(avg = mean(length, na.rm = TRUE))

## # A tibble: 1 x 1
##     avg
##   <dbl>
## 1  87.8

Dividing the Twitter age (twitterAge) into categorical bins,

one bin for each age group.

# Bin twitterAge into width-1 categories.
# The breaks start at 0 because twitterAge has values below 1 (its minimum in
# the summaries is 0). With breaks starting at 1 those rows fall outside every
# bin and become NA. include.lowest = TRUE keeps values exactly equal to 0.
# cut() has no na.rm argument, so none is passed.
twitter_age_range <-
  cut(twitter$twitterAge, breaks = c(0, 1, 2, 3, 4, 5, 6),
      labels = c("0-1", "1-2", "2-3", "3-4", "4-5", "5-6"),
      include.lowest = TRUE)
View(twitter)

# Prepend the bin as the first column of the data set.
twitter <- cbind(twitter_age_range, twitter)
# Keep the age-range, age, tweet-count and length columns, then stack the
# twitterAge column into a key/value pair (count = column name, value = age);
# the other three columns are carried along unchanged.
second_twitter <- gather(
  select(twitter, twitter_age_range, twitterAge, `#tweetsPosted`, length),
  key = "count", value = "value", twitterAge
)
View(second_twitter)
# Box plot of tweet length for each age range.
# First compute the median length per age range (one row per range), then draw
# the box plots from the row-level data so each box shows the distribution of
# `length` within its age range.
median_length <- twitter %>%
  group_by(twitter_age_range) %>%
  summarise(x = median(length), .groups = "drop")
View(median_length)
print(typeof(median_length))

## [1] "list"

# x is the age-range factor and y the tweet length. A single overall median
# coerced to a factor carries no per-row information and cannot be used as an
# axis variable.
twitter %>%
  ggplot(aes(x = twitter_age_range, y = length)) +
  geom_boxplot() +
  xlab("age range") + ylab("tweet length")

## Now we plot which age group is most active in tweeting and which age group has the highest number of followers.

# Total tweets posted per age range (bars stack the per-row values).
# Backticks refer to the `#tweetsPosted` column; in ordinary quotes the name is
# just a constant string, so the y axis would not show the data at all.
# na.rm is a layer argument, not an aesthetic, so it is passed to geom_bar().
twitter %>%
  ggplot(aes(x = twitter_age_range, y = `#tweetsPosted`)) +
  geom_bar(stat = "identity", na.rm = TRUE) +
  xlab("age range") +
  ylab("tweets posted")

# Followers per age range: green bars whose heights stack the nFollowers values
# of all rows in each range.
ggplot(
  data = select(twitter, twitter_age_range, nFollowers),
  mapping = aes(x = twitter_age_range, y = nFollowers)
) +
  geom_col(fill = "green") +
  labs(
    title = "bar plot to show the no of followers",
    x = "age range",
    y = "no of followers"
  )