Student name: Gerelchuluun Amarsanaa Student ID: 12300075
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(psych)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
## The following object is masked from 'package:dplyr':
##
## recode
# Load the Instagram posts dataset; the CSV is expected in the working directory.
mydata <- read_csv(file = "instagram_data.csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 11692 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): owner_id, owner_username, shortcode, caption, location, imageUrl, u...
## dbl (5): comments, likes, created_at, followers, following
## lgl (2): is_video, multiple_images
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
The dataset is published on kaggle.com under the MIT licence. Link to the data: https://www.kaggle.com/datasets/propriyam/instagram-data
# Recode the logical post-type flags as labelled factors so that grouped
# summaries and plot facets show readable names instead of TRUE/FALSE.
mydata$videoF <- factor(
  mydata$is_video,
  levels = c(FALSE, TRUE),
  labels = c("Not_Video", "Video")
)
mydata$multiImageF <- factor(
  mydata$multiple_images,
  levels = c(FALSE, TRUE),
  labels = c("SingleImage", "MultipleImages")
)
# Keep only the columns used in the analysis. Columns are selected by name
# rather than dropped by position, so the result does not depend on the
# column order of the CSV file.
mydata <- mydata[, c(
  "owner_id", "owner_username", "shortcode", "is_video",
  "comments", "likes", "multiple_images",
  "followers", "following", "videoF", "multiImageF"
)]
head(mydata)
## # A tibble: 6 × 11
## owner_id owner_username shortcode is_video comments likes multiple_images
## <chr> <chr> <chr> <lgl> <dbl> <dbl> <lgl>
## 1 36063641 christendominique C3_GS1ASeWI FALSE 268 16382 TRUE
## 2 36063641 christendominique C38ivgNS3IX TRUE 138 9267 FALSE
## 3 36063641 christendominique C35-Dd9SO1b TRUE 1089 10100 FALSE
## 4 36063641 christendominique C33TadDMisq TRUE 271 6943 FALSE
## 5 36063641 christendominique C3s-Cm1yCba TRUE 145 17158 FALSE
## 6 36063641 christendominique C3n0pZJvG-G TRUE 143 9683 FALSE
## # ℹ 4 more variables: followers <dbl>, following <dbl>, videoF <fct>,
## # multiImageF <fct>
# Coerce the four count columns to numeric (values that cannot be coerced
# become NA), then drop every row that contains an NA in any column.
mydata <- mydata %>%
  mutate(across(c(comments, likes, followers, following), as.numeric)) %>%
  na.omit()
head(mydata)
## # A tibble: 6 × 11
## owner_id owner_username shortcode is_video comments likes multiple_images
## <chr> <chr> <chr> <lgl> <dbl> <dbl> <lgl>
## 1 36063641 christendominique C3_GS1ASeWI FALSE 268 16382 TRUE
## 2 36063641 christendominique C38ivgNS3IX TRUE 138 9267 FALSE
## 3 36063641 christendominique C35-Dd9SO1b TRUE 1089 10100 FALSE
## 4 36063641 christendominique C33TadDMisq TRUE 271 6943 FALSE
## 5 36063641 christendominique C3s-Cm1yCba TRUE 145 17158 FALSE
## 6 36063641 christendominique C3n0pZJvG-G TRUE 143 9683 FALSE
## # ℹ 4 more variables: followers <dbl>, following <dbl>, videoF <fct>,
## # multiImageF <fct>
# Columns summarised by describeBy(): the two engagement counts plus the two
# grouping factors. Selected by name so the subset does not depend on the
# column positions in mydata.
numericData <- mydata[, c("comments", "likes", "videoF", "multiImageF")]
The code in this section creates a table, selenaGomezData, that contains only the posts of the user whose owner_username is “selenagomez”.
# Posts belonging to the account "selenagomez" only.
selenaGomezData <- mydata %>%
  filter(owner_username == "selenagomez")
First, the code in this section selects the columns that will be plotted on the scatterplots and saves them in the variable valuesByUsers. It then adds a factor, usersF, that identifies each user by owner_id. Next, the number of likes is summed for each user and stored in the likesByUsers table, while the number of followers and followings for each user is stored in the followersByUsers and followingByUsers tables. Finally, the three tables are joined on the usersF variable (the factor built from owner_id).
# Per-post values needed for the user-level scatterplots, plus a factor that
# identifies the posting user (levels in order of first appearance).
valuesByUsers <- mydata %>%
  select(owner_id, likes, followers, following) %>%
  mutate(usersF = factor(owner_id, levels = unique(owner_id)))
head(valuesByUsers)
## # A tibble: 6 × 5
## owner_id likes followers following usersF
## <chr> <dbl> <dbl> <dbl> <fct>
## 1 36063641 16382 2144626 1021 36063641
## 2 36063641 9267 2144626 1021 36063641
## 3 36063641 10100 2144626 1021 36063641
## 4 36063641 6943 2144626 1021 36063641
## 5 36063641 17158 2144626 1021 36063641
## 6 36063641 9683 2144626 1021 36063641
# Total likes per user, summed over all of the user's posts.
likesByUsers <- valuesByUsers %>%
  group_by(usersF) %>%
  summarize(total_likes = sum(likes))
# Follower / following counts per user; the first value within each user's
# rows is taken as the user's count.
followersByUsers <- valuesByUsers %>%
  group_by(usersF) %>%
  summarize(followers = first(followers))
followingByUsers <- valuesByUsers %>%
  group_by(usersF) %>%
  summarize(followings = first(following))
# Combine the three per-user tables into one row per user.
scatterplotData <- merge(
  merge(likesByUsers, followersByUsers, by = "usersF"),
  followingByUsers,
  by = "usersF"
)
# Removing outlier. User IDs are text, so the ID is compared as a string
# instead of relying on implicit number-to-string coercion.
scatterplotData <- scatterplotData[
  as.character(scatterplotData$usersF) != "460563723",
]
head(scatterplotData)
## usersF total_likes followers followings
## 1 1000542340 375 148480 1459
## 2 10025623 30901 327698 3086
## 3 1010372386 4998 59092 400
## 4 1014516 277255 1358035 873
## 5 10245870 7227812 22489144 751
## 6 10274566 44260 437885 1433
# Descriptive statistics of the analysis columns, split by video / non-video.
describeBy(x = numericData, group = numericData$videoF)
##
## Descriptive statistics by group
## group: Not_Video
## vars n mean sd median trimmed mad min max
## comments 1 6135 425.58 3932.9 35 66.26 47.44 0 122771
## likes 2 6135 33284.86 291908.9 2306 5526.78 3183.14 0 12685879
## videoF 3 6135 1.00 0.0 1 1.00 0.00 1 1
## multiImageF 4 6135 1.54 0.5 2 1.55 0.00 1 2
## range skew kurtosis se
## comments 122771 20.91 505.53 50.21
## likes 12685879 27.89 960.99 3726.83
## videoF 0 NaN NaN 0.00
## multiImageF 1 -0.16 -1.97 0.01
## ------------------------------------------------------------
## group: Video
## vars n mean sd median trimmed mad min max
## comments 1 3442 412.19 2721.94 47 88.94 60.79 0 71049
## likes 2 3442 28868.86 165479.64 2084 5457.16 2757.64 0 3764323
## videoF 3 3442 2.00 0.00 2 2.00 0.00 2 2
## multiImageF 4 3442 1.00 0.00 1 1.00 0.00 1 1
## range skew kurtosis se
## comments 71049 18.44 416.41 46.40
## likes 3764323 14.07 249.63 2820.58
## videoF 0 NaN NaN 0.00
## multiImageF 0 NaN NaN 0.00
# Descriptive statistics of the analysis columns, split by single-image /
# multiple-image posts.
describeBy(x = numericData, group = numericData$multiImageF)
##
## Descriptive statistics by group
## group: SingleImage
## vars n mean sd median trimmed mad min max
## comments 1 6264 397.03 3616.12 32.0 66.30 44.48 0 122771
## likes 2 6264 23596.76 151259.01 1703.5 4218.10 2286.17 0 4414844
## videoF 3 6264 1.55 0.50 2.0 1.56 0.00 1 2
## multiImageF 4 6264 1.00 0.00 1.0 1.00 0.00 1 1
## range skew kurtosis se
## comments 122771 22.60 600.55 45.69
## likes 4414844 16.63 349.57 1911.15
## videoF 1 -0.20 -1.96 0.01
## multiImageF 0 NaN NaN 0.00
## ------------------------------------------------------------
## group: MultipleImages
## vars n mean sd median trimmed mad min max
## comments 1 3313 465.64 3407.96 56 87.64 71.16 0 90183
## likes 2 3313 47014.51 377696.21 3748 8253.31 5083.84 0 12685879
## videoF 3 3313 1.00 0.00 1 1.00 0.00 1 1
## multiImageF 4 3313 2.00 0.00 2 2.00 0.00 2 2
## range skew kurtosis se
## comments 90183 18.25 398.69 59.21
## likes 12685879 22.96 623.25 6561.93
## videoF 0 NaN NaN 0.00
## multiImageF 0 NaN NaN 0.00
# Histogram of likes per post for selenagomez, one panel per video factor level.
# Bins are one million likes wide, running from 2 million to 13 million.
ggplot(selenaGomezData, aes(x = likes)) +
  geom_histogram(
    breaks = seq(2000000, 13000000, by = 1000000),
    fill = "blue",
    color = "black"
  ) +
  # Facet by column name (formula interface) rather than by passing the raw
  # data vector, so the facet variable is looked up in the plot data.
  facet_wrap(~ videoF, ncol = 1) +
  labs(
    title = "Histogram 1: Selena Gomez likes on posts by Video Factor",
    x = "Number of Likes (by 1000000)",
    y = "Frequency"
  ) +
  theme_minimal()
Histogram 1 shows the number of likes on posts by the user “selenagomez”, split by the video factor. Non-video posts clearly receive more likes, while video posts receive fewer than 5 million likes. Furthermore, it shows that the user posts non-video content more often in this dataset.

___________________________________________
# Histogram of likes per post for selenagomez, one panel per multiple-images
# factor level. Bins are one million likes wide, from 2 million to 13 million.
ggplot(selenaGomezData, aes(x = likes)) +
  geom_histogram(
    breaks = seq(2000000, 13000000, by = 1000000),
    fill = "red",
    color = "black"
  ) +
  # Facet on the labelled factor (SingleImage / MultipleImages) by column name,
  # so panels carry readable labels instead of FALSE / TRUE.
  facet_wrap(~ multiImageF, ncol = 1) +
  labs(
    title = "Histogram 2: Selena Gomez likes on posts by Multiple images Factor",
    x = "Number of Likes (by 1000000)",
    y = "Frequency"
  ) +
  theme_minimal()
Histogram 2 shows the number of likes on posts by the user “selenagomez”, split by the multiple-images factor. Posts with multiple images clearly receive more likes, and the user posts multiple-image content more often in this dataset.
# Scatterplot of each user's total likes (y) against follower count (x).
# The axis labels now match the plotted variables: followers on the x-axis,
# total likes on the y-axis.
scatterplot(
  x = scatterplotData$followers,
  y = scatterplotData$total_likes,
  xlab = "Users' followers",
  ylab = "Users' total likes",
  main = "Plot 1: Correlation between users total likes and followers",
  smooth = FALSE
)
Plot 1 shows the relationship between the total number of likes and the number of followers for each user. There is a clear positive correlation: users with a higher number of likes also tend to have a higher number of followers.
# Scatterplot of the number of accounts each user follows (y) against the
# user's total likes (x), drawn in red without a smoother.
with(
  scatterplotData,
  scatterplot(
    x = total_likes,
    y = followings,
    xlab = "Users' total likes",
    ylab = "Users' followings",
    col = "red",
    main = "Plot 2: Correlation between users total likes and followings",
    smooth = FALSE
  )
)
Plot 2 shows the relationship between users' total number of likes and the number of accounts they follow. There is no apparent correlation, meaning that the number of likes a user receives appears to be unrelated to the number of people the user follows.
# Scatterplot of each user's follower count (y) against the number of accounts
# the user follows (x), drawn in black without a smoother.
with(
  scatterplotData,
  scatterplot(
    x = followings,
    y = followers,
    xlab = "Users' followings",
    ylab = "Users' followers",
    col = "black",
    main = "Plot 3: Correlation between users followers and followings",
    smooth = FALSE
  )
)
Plot 3 shows the relationship between the number of followers each user has and the number of accounts the user follows. There is no apparent correlation, meaning that the number of people who follow a user appears to be unrelated to the number of people the user follows.