Description of the Dataset

This document contains analysis of a dataset from Toronto CMA containing 4263 records on 241 different variables. These variables range from demographic variables such as those describing age, gender and ethnicity to variables about life style such as household income and type of dwelling as well as variables about preferences of the subjects such as internet last used and number of mall visits per week. Some of these are categorical variables, whereas others are quantitative variables. The following analysis is an attempt gather some insights from the data provided.

Loading the required R packages

library(ggplot2)
library(reshape2)
library(plyr)

Reading the data into R

setwd("C:\\Users\\Alina\\OneDrive\\R")
mydata<-read.csv("torontoCMA.csv")

Graphs and Analysis

par(mfrow=c(1,2))

# Age and Gender Bar Graph
ggplot(mydata,aes(x=factor(Age), fill=factor(Gender)))+
  geom_bar(position="dodge")+
  scale_x_discrete("Age Groups", 
                   labels=c("1"="12-17","2"="18-24","3"="25-34",
                            "4"="35-49","5"="50-64","6"="65+"))+
  scale_fill_discrete(name="Gender",
                      breaks=c("1", "2"),
                      labels=c("Male", "Female"))+
  ggtitle("Age Groups by Gender")+
  ylab('No. of People')


# Education bar graph
qplot(as.factor(mydata$Education), geom='bar', fill=as.factor(mydata$Gender))+
  coord_flip()+
  scale_x_discrete("Education Levels", 
                   labels=c("1"="No Cert","2"="Secondary School Grad",
                            "3"="Certificate/Diploma","4"="University Cert",
                            "5"="Bachelors Degree","6"="Post Grad Degree"))+
  scale_fill_discrete(name="Gender",
                      breaks=c("1", "2"),
                      labels=c("Male", "Female"))+
  ggtitle("Education Levels by Gender")+ylab('No. of People')

The figures show the distribution of the population of Toronto by gender, age group and education levels. The left panel shows that there are more females than males overall, and the most populous age group is the 50-64 age group. More than half of the population is of ages 35 and above (median and mean). Whereas the least number of people are in the 12-17 age group. The right panel shows that education levels in Toronto range from having no certificate or diploma to having a post graduate degree. A Bachelor’s Degree is the most common qualification, and most people have either a (university or non-university) certificate or a higher qualification (25th percentile).

par(mfrow=c(1,2))

# Pie chart for Languages Spoken
mydata$LangSum <- reorder(mydata$LangSum, X = mydata$LangSum, 
                          FUN = function(x) -length(x))

at <- nrow(mydata) - as.numeric(cumsum(sort(table(mydata$LangSum)))-
                                  0.5*sort(table(mydata$LangSum)))

label=paste0(round(sort(table(mydata$LangSum))/sum(table(mydata$LangSum)),2) * 100,"%")

p <- ggplot(mydata,aes(x="", fill = LangSum)) +
  geom_bar(width = 1) +
  scale_fill_discrete(name="Language Spoken",
                      breaks=c("1", "2","3"),
                      labels=c("English", "French","Other"))+
  coord_polar(theta="y") +
  annotate(geom = "text", y = at, x = 1, label = label, size=4)+
  theme(axis.text = element_blank(),
        axis.ticks = element_blank(),
        panel.grid  = element_blank(),
        axis.title.x = element_blank(),
        axis.title.y = element_blank())+
  ggtitle("Distribution of Languages \n Spoken")
print(p)


# Pie chart for Marital Status
mydata$MaritalStatus <- reorder(mydata$MaritalStatus, X = mydata$MaritalStatus,
                                FUN = function(x) -length(x))

at <- nrow(mydata) - as.numeric(cumsum(sort(table(mydata$MaritalStatus)))-
                                  0.5*sort(table(mydata$MaritalStatus)))

label=paste0(round(sort(table(mydata$MaritalStatus))/
                     sum(table(mydata$MaritalStatus)),2) * 100,"%")


p <- ggplot(mydata,aes(x="", fill = MaritalStatus)) +
  geom_bar(width = 1) +
  
  scale_fill_discrete(name="Marital Status",
                      breaks=c("1", "2"),
                      labels=c("Married/Living Together", "Single"))+
  coord_polar(theta="y") +
  annotate(geom = "text", y = at, x = 1, label = label, size=4)+
  theme(axis.text = element_blank(),
        axis.ticks = element_blank(),
        panel.grid  = element_blank(),
        axis.title.x = element_blank(),
        axis.title.y = element_blank())+
  ggtitle("Marital Status")
print(p)

The two figures above show the distribution of the population of Toronto by languages spoken and marital status. The left panel shows that English seems to be the most common language spoken in Toronto, but a number of people also speak other languages including French. According to the pie chart on the right, about half of the population in the city is married or living with a partner.

par(mfrow=c(1,2))

#Employment Pie Chart
mydata$EmploySum <- reorder(mydata$EmploySum, X = mydata$EmploySum, 
                            FUN = function(x) -length(x))

at <- nrow(mydata) - as.numeric(cumsum(sort(table(mydata$EmploySum)))-
                                  0.5*sort(table(mydata$EmploySum)))

label=paste0(round(sort(table(mydata$EmploySum))/
                     sum(table(mydata$EmploySum)),2) * 100,"%")

p <- ggplot(mydata,aes(x="", fill = EmploySum)) +
  geom_bar(width = 1) +
  scale_fill_discrete(name="Employment",
                      breaks=c("1","2","3"),
                      labels=c("Full-time", "Part-time","Unemployed"))+
  coord_polar(theta="y") +
  annotate(geom = "text", y = at, x = 1, label = label,size=4)+
  theme(axis.text = element_blank(),
        axis.ticks = element_blank(),
        panel.grid  = element_blank(),
        axis.title.x = element_blank(),
        axis.title.y = element_blank())+
  ggtitle("Employment Type")
print(p)

#personal income bar chart
df <- data.frame(matrix(ncol = 2, nrow = 12)) 
colnames(df)<-c('Income','Frequency')
df$Income<-c('0','under 25000', '25000-34999','35000-39999','40000-49999',
           '50000-59999','60000-74999','75000-99999','100000-124999',
           '125000-149999','150000-199999','200000+')
for (i in 1:12){df[i,2]<-sum(mydata[,77+i])}

df$Income<-factor(df$Income, levels=df$Income) #to prevent reordering of bars


ggplot(df,aes(x=Income, y=Frequency), fill=df$Income)+
  geom_bar(stat="identity")+
  scale_x_discrete("Personal Income Groups")+
  coord_flip()+
  ggtitle('Personal Income Distribution')+
  ylab('No. of People')

The graphs above show the population distribution of Toronto by employment type and personal income. As shown in the left panel above, more than half of the population is occupied in full-time jobs, about 10% have part-time jobs and the rest are unemployed. The unemployed group is significant as it also includes those people who are too old to work (65+ age group). According to the graph on the right, people in Toronto earn between 0 and 200000+ dollars. On average a person in Toronto will earn about 53000 dollars (mean) and three quarters of the population earns 67500 dollars or less (75th percentile).

From the above information it can be concluded that advertising aimed at females, people between the ages of 35-64 and English speakers has a better chance of being effective in Toronto. A significant proportion of the population is educated up to a university certificate or more, so advertising and media sources should be designed for a well-educated audience. About half of the population is married so products/advertising aimed at married couples as well as single people are likely to do equally well. Products that target a variety of income groups under 67500 dollars could be popular. However, there is a small group of people that earn more than 100000 dollars. This might be of interest to brands that sell high value products.

par(mfrow=c(1,2))

#magazine last read cumulative
df <- data.frame(matrix(ncol = 2, nrow =8 )) 
colnames(df)<-c('Time','Cumulative_Frequency')
df$Time<-c('1 day', '7 days','30 days','60 days','90 days','365 days','365+ days','Not Stated')
for (i in 1:8){df[i,2]<-sum(mydata[,117+i]*mydata[,2])}

df$Time<-factor(df$Time, levels=df$Time) #to prevent reordering of bars

ggplot(df,aes(x=Time, y=Cumulative_Frequency), fill=df$Time)+
  geom_bar(stat="identity")+
  xlab("Time since last read a magazine")+ylab('Cumulative No. of People')+
  ggtitle('Cumulative Population Distribution \n by Magazine Last Read')+
  coord_flip()


#magazine last read
df <- data.frame(matrix(ncol = 4, nrow =6 )) 
colnames(df)<-c('Time','Cumulative_Frequency','Frequency','percentage')
df$Time<-c('1 day', '7 days','30 days','60 days','90 days','365 days')
for (i in 1:6){df[i,2]<-sum(mydata[,117+i]*mydata[,2])}
df[1,3]<-df[1,2]
for (i in 2:6){df[i,3]<-df[i,2]-df[i-1,2]}
for (i in 1:6){df[i,4]<-((df[i,3])/sum(df[,3]))*100}
df$percentage<-round(df$percentage, digits = 0)

df$Time<-factor(df$Time, levels=df$Time) #to prevent reordering of bars


df <- ddply(df, .(Time), 
            transform, pos = cumsum(Frequency) - (0.5 * Frequency))

p<-ggplot(df,aes(x=Time, y=Frequency), fill=df$Time)+
  geom_bar(stat="identity")+
  scale_x_discrete("Time since last read a magazine")+
  geom_text(data=df, aes(x=Time,y = pos, 
                         label = paste0(percentage,"%")),size=4)+
  ylab('No. of People')+
  ggtitle('Population Distribution by \n Magazine Last Read')
print(p)

The two graphs above depict the data about how often people have looked at a magazine in the past year or more. The left panel shows the cumulative population of people who have looked at a magazine in the past year or longer. It depicts that most of the population of the city has looked at magazines in the past year, but there are a few who have not looked at one in over a year (compare the ‘365 days’ bar with the ‘365+ days’ bar). The latter have not been included in the cumulative population total calculated in the graph. The right panel shows that if a person has read a magazine in the last year, it is most likely (36% chance) that he has looked at one in the past week. The chances that he has looked at one the past day or in the past month are also very high (27% and 19% respectively). So magazines seem to be reaching most of the population of Toronto quite frequently.

par(mfrow=c(1,2))

# Last read by Age Group bar graph
mag<-mydata[,c(100,118:125)]
magsum<-aggregate(mag,by=list(mag$Age),sum)
magsum<-magsum[,c(1,3:10)]
names(magsum)<-c("Age",'1 day', '7 days','30 days',
                 '60 days','90 days','365 days','365+ days','Not Stated')

magsum.long<-melt(magsum, id.vars="Age")


ggplot(magsum.long,aes(x=variable,y=value,fill=factor(Age)))+
  geom_bar(stat="identity")+
  scale_fill_discrete(name="Age Groups",
                      breaks=c(1,2,3,4,5,6),
                      labels=c('12-17','18-24','25-34','35-49','50-64','65+'))+
  coord_flip()+
  xlab("Days since last read a magazine")+ylab("No. of People")+
  ggtitle("Magazine Readership by Age")

# last read with education bar graph
mag<-mydata[,c(101,123)]
magsum<-aggregate(mag,by=list(mag$Education),sum)
magsum<-magsum[,c(1,3)]
names(magsum)<-c("Education",'365 days')

magsum.long<-melt(magsum, id.vars="Education")


ggplot(magsum.long,aes(x=variable,y=value,fill=factor(Education)))+
  geom_bar(stat="identity", position="dodge")+
  scale_fill_discrete(name="Education Level",
                      breaks=c(1,2,3,4,5,6),
                      labels=c("No Cert","Secondary School Grad",
                               "Certificate/Diploma","University Cert",
                               "Bachelors Degree","Post Grad Degree"))+
  xlab("Education Level")+ylab("No. of People")+
  ggtitle("Number of people who read a magazine
          \n in the last year by education level")+
  theme(axis.text.x=element_blank(),
        axis.ticks.x=element_blank())+
  ggtitle("Magazine Readership \n by Education")

In the two graphs above, the left panel above shows a graph of the distribution of people who have looked at a magazine by age groups. The 50-64 age group has contributed most to magazines read over all time ranges. This may be a direct effect of the population distribution (most people lie in the 50-64 age group and least people lie in the 12-17 age group). The right panel shows a graph of the distribution of people who have looked at a magazine by education level (only considering people who have looked at a magazine in the past year). The graph shows that those with a certification or a higher level of education are more likely to have looked at a magazine in the past year. This may be directly resulting from the fact that overall there are more people in Toronto with at least a certification or more education.

# facet graphs for newspaper sections
df<-data.frame(matrix(ncol=3,nrow=60))
colnames(df)<-c('Freq','News_Type','Total_People')
df$Freq<-rep(c('usually','sometimes','seldom','never'), each=15)
df$News_Type<-rep(c('local','national','world','sports','finance',
                      'entertainment','editorials','food','fashion','travel','automotive',
                      'comics','real estate','health','puzzles'),4)

for (i in 1:15){df[i,3]<-sum(mydata[,125+i])}
for (i in 1:15){df[i+15,3]<-sum(mydata[,140+i])}
for (i in 1:15){df[i+30,3]<-sum(mydata[,155+i])}
for (i in 1:15){df[i+45,3]<-sum(mydata[,170+i])}


p<-ggplot(df, aes(x=News_Type, y=Total_People,fill=factor(News_Type)))+
  geom_bar(stat = 'identity')+facet_wrap(~ Freq, ncol=2)+
  theme(axis.text.x=element_blank(),
        axis.ticks.x=element_blank())+
  scale_fill_discrete(name="News Type")+
  xlab("News Section")+ylab("No. of People")
  #ggtitle("Newspaper Popularity by Sections")
print(p)

The graph above shows how often each of the different sections of a newspaper are read by the surveyed sample population. It is obvious that the local, national and world news sections are most popular (they have the greatest heights in the ‘usually’ section of the graph). The automotive and comics sections seems to be the least popular as they have the greatest heights in the ‘never’ section. This can give the newspaper agencies an idea about their performance in different areas and can help them improve their overall readership and ad sales.

par(mfrow=c(1,2))

#last accessed internet non-cumulative
df <- data.frame(matrix(ncol = 4, nrow =6 )) 
colnames(df)<-c('Time','Cumulative_Frequency', 'Frequency','percentage')
df$Time<-c('1 day', '7 days','14 days','30 days','60 days','90 days')
for (i in 1:6){df[i,2]<-sum(mydata[,203+i]*mydata[,2])}
df[1,3]<-df[1,2]
for (i in 2:6){df[i,3]<-df[i,2]-df[i-1,2]}
for (i in 1:6){df[i,4]<-((df[i,3])/sum(df[,3]))*100}
df$percentage<-round(df$percentage, digits = 0)
df$Time<-factor(df$Time, levels=df$Time) #to prevent reordering of bars


df <- ddply(df, .(Time), 
            transform, pos = cumsum(Frequency) - (0.5 * Frequency))

p<-ggplot(df,aes(x=Time, y=Frequency), fill=df$Time)+
  geom_bar(stat="identity")+
  scale_x_discrete("Time since last accessed internet")+
  geom_text(data=df, aes(x=Time,y = pos, 
                         label = paste0(percentage,"%")),size=4)+
  ylab('No. of People')+ggtitle("Internet Usage")
p

#distance traveled
df <- data.frame(matrix(ncol = 2, nrow =11 )) 
colnames(df)<-c('Distance','Frequency')
df$Distance<-c('Under 15km', '15-24km','25-49km','50-74km','75-99km',
           '100-149km','150-249km','250-500km','500+km','None','Not Stated')
for (i in 1:11){df[i,2]<-sum(mydata[,211+i])}

df$Distance<-factor(df$Distance, levels=df$Distance) #to prevent reordering of bars

ggplot(df,aes(x=Distance, y=Frequency), fill=df$Distance)+
  geom_bar(stat="identity")+
  xlab("Distance Traveled in Toronto")+ylab('No. of People')+
  coord_flip()+ggtitle("Distance Traveled")

The left panel shows a graph of internet usage by the people of Toronto. It reveals that 82% of the population has accessed internet in the past 1 day, and almost all of them have accessed it in either the past day or the past week. The right panel-which shows a graph of distance travelled within Toronto over the past week-shows that people travel between 0 and 500+ km in an average week. The median distance traveled per week is around 62 km, hence about half of the population travels less than 62 km. The graph depicts that majority of the population travels less than 200 km in a week (75th percentile).

People access the internet very frequently, so internet can be a good medium to capture a greater audience. The information about distances travelled could be used for advertising; marketing agencies could try to reach the consumer ‘on the go’ by advertising in different modes of public transport like buses and trains (for people commonly taking public transport) or on roads and bill boards (for people using personal transport).

par(mfrow=c(1,2))

#no of shopping mall visits in past week
df <- data.frame(matrix(ncol = 2, nrow =8 )) 
colnames(df)<-c('Visits','Count')
df$Visits<-c('None', '1 visit','2 visits','3 visits','4 visits',
               '5 visits','6+ visits','Not Stated')
for (i in 1:8){df[i,2]<-sum(mydata[,223+i])}

df$Visits<-factor(df$Visits, levels=df$Visits) #to prevent reordering of bars


ggplot(df,aes(x=Visits, y=Count), fill=df$Visits)+
  geom_bar(stat="identity")+
  scale_x_discrete("Shopping mall visits in the past week")+
  coord_flip()+ylab('No. of People')+
  ggtitle("Mall Visits per Week")

#no of shopping mall visits in past 4 weeks
df <- data.frame(matrix(ncol = 2, nrow =8 )) 
colnames(df)<-c('Visits','Count')
df$Visits<-c('None', '1 visit','2 visits','3 visits','4 visits',
             '5 visits','6+ visits','Not Stated')
for (i in 1:8){df[i,2]<-sum(mydata[,232+i])}

df$Visits<-factor(df$Visits, levels=df$Visits) #to prevent reordering of bars

ggplot(df,aes(x=Visits, y=Count), fill=df$Visits)+
  geom_bar(stat="identity")+
  scale_x_discrete("Shopping mall visits in the past 4 weeks")+
  coord_flip()+ylab('No.of People')+
  ggtitle("Mall Visits per 4 Weeks")

The two graphs show the number of time people have visited a shopping mall over the past 1 week and 4 weeks respectively. The left panel shows that people are most likely to visit the mall 2 or less times in a week (75th percentile). Over 4 weeks a large portion of Toronto’s population visits malls either 2 or less times but a significant proportion also visits 6 times. This shows a clear divide between people who go to the mall often and those who go seldom. Different marketing offers can be designed to cater to both these market segments.

The conclusions made above are not comprehensive but only highlight certain important aspects of the test data provided. All of the graphs and other analysis of the dataset has been completed using R.