Social Media Insights: Exploring User Behavior and Trends

Loading the library and the dataset, then summarizing the social media data with statistics such as mean, median, max, min, and sum

library(readr)
# Read the CSV file into `data`; no column types are specified, so readr
# guesses them from the file contents.
# NOTE(review): the path is absolute and machine-specific, so this line only
# works where that exact file location exists.
data <- read_csv(
  file = "C:/Users/bhaga/OneDrive/Desktop/new dataset/social media_new.csv"
)

## Rows: 1628 Columns: 40
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (15): City, Current Status, Do you own multiple profiles on Instagram?, ...
## dbl (11): Age, Latitude, Longitude, How many subscriber do you have on youtu...
## num (14): How many followers do you have on Instagram?, How many posts do yo...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Open the data in the spreadsheet-style viewer. View() needs a GUI, so it is
# only called in an interactive session; non-interactive runs (Rscript,
# batch rendering) skip it instead of failing or blocking.
if (interactive()) {
  View(data)
}

# List the column names of the dataset
names(data)

##  [1] "Age"                                                 
##  [2] "City"                                                
##  [3] "Current Status"                                      
##  [4] "Do you own multiple profiles on Instagram?"          
##  [5] "Gender"                                              
##  [6] "Highest Education"                                   
##  [7] "Location (City Airport Code)"                        
##  [8] "Phone OS"                                            
##  [9] "State"                                               
## [10] "Zone"                                                
## [11] "How many followers do you have on Instagram?"        
## [12] "How many posts do you have on Instagram?"            
## [13] "Latitude"                                            
## [14] "Longitude"                                           
## [15] "Time Spent on Facebook in last week (in minutes)"    
## [16] "Time Spent on Facebook in last weekend (in minutes)" 
## [17] "Time Spent on Instagram in last week (in minutes)"   
## [18] "Time Spent on Instagram in last weekend (in minutes)"
## [19] "Time Spent on WhatsApp in last week (in minutes)"    
## [20] "Time Spent on WhatsApp in last weekend (in minutes)" 
## [21] "Total Facebook Usage"                                
## [22] "Total Instagram Usage"                               
## [23] "Total Social Media Usage"                            
## [24] "Total Week Usage"                                    
## [25] "Total Weekend Usage"                                 
## [26] "Total WhatsApp Usage"                                
## [27] "How many subscriber do you have on youtube"          
## [28] "Income from YouTube (rs,month)"                      
## [29] "Internet Speed (Mbps)"                               
## [30] "Total YouTube Usage (minutes)"                       
## [31] "Weekly YouTube Usage (minutes)"                      
## [32] "Profession/Activity"                                 
## [33] "Likes on Instagram"                                  
## [34] "Shares on Instagram"                                 
## [35] "Hobby"                                               
## [36] "Verified Account on instagram"                       
## [37] "Date of Birth"                                       
## [38] "Time Spent on Twitter per Week (minutes)"            
## [39] "Total Time Spent on Twitter (minutes)"               
## [40] "Groups Joined on Instagram"

# Show the structure of the dataset: type and first values of each column
utils::str(data)

## spc_tbl_ [1,628 × 40] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Age                                                 : num [1:1628] 24 39 22 26 50 25 52 45 25 27 ...
##  $ City                                                : chr [1:1628] "Delhi" "Delhi" "Mumbai" "Bengaluru" ...
##  $ Current Status                                      : chr [1:1628] "Working professional" "Working professional" "Working professional" "Sabbatical" ...
##  $ Do you own multiple profiles on Instagram?          : chr [1:1628] "No" "No" "No" "Yes" ...
##  $ Gender                                              : chr [1:1628] "Female" "Female" "Male" "Female" ...
##  $ Highest Education                                   : chr [1:1628] "Graduation" "Post graduation" "Graduation" "Graduation" ...
##  $ Location (City Airport Code)                        : chr [1:1628] "DEL" "DEL" "BOM" "BLR" ...
##  $ Phone OS                                            : chr [1:1628] "iOs" "iOs" "Android" "Android" ...
##  $ State                                               : chr [1:1628] "Delhi" "Delhi" "Maharashtra" "Karnataka" ...
##  $ Zone                                                : chr [1:1628] "Northern" "Northern" "Western" "Southern" ...
##  $ How many followers do you have on Instagram?        : num [1:1628] 456 0 400 485 0 ...
##  $ How many posts do you have on Instagram?            : num [1:1628] 20 0 6 16 0 220 0 0 340 37 ...
##  $ Latitude                                            : num [1:1628] 28.7 28.7 19 13 28.7 ...
##  $ Longitude                                           : num [1:1628] 77.2 77.2 72.8 77.6 77.2 ...
##  $ Time Spent on Facebook in last week (in minutes)    : num [1:1628] 0 6000 500 1500 1500 1000 300 983 1160 480 ...
##  $ Time Spent on Facebook in last weekend (in minutes) : num [1:1628] 0 2160 2000 1500 1500 1200 900 873 870 840 ...
##  $ Time Spent on Instagram in last week (in minutes)   : num [1:1628] 770 0 1000 2000 0 3000 0 0 1240 720 ...
##  $ Time Spent on Instagram in last weekend (in minutes): num [1:1628] 400 0 1000 2000 0 840 215 0 340 300 ...
##  $ Time Spent on WhatsApp in last week (in minutes)    : num [1:1628] 900 5000 7000 1680 2400 2100 1800 583 1760 3000 ...
##  $ Time Spent on WhatsApp in last weekend (in minutes) : num [1:1628] 120 2000 2000 1680 1300 600 1500 834 450 600 ...
##  $ Total Facebook Usage                                : num [1:1628] 0 8160 2500 3000 3000 ...
##  $ Total Instagram Usage                               : num [1:1628] 1170 0 2000 4000 0 3840 215 0 1580 1020 ...
##  $ Total Social Media Usage                            : num [1:1628] 2190 15160 13500 10360 6700 ...
##  $ Total Week Usage                                    : num [1:1628] 1670 11000 8500 5180 3900 ...
##  $ Total Weekend Usage                                 : num [1:1628] 520 4160 5000 5180 2800 ...
##  $ Total WhatsApp Usage                                : num [1:1628] 1020 7000 9000 3360 3700 ...
##  $ How many subscriber do you have on youtube          : num [1:1628] 33356 25394 34603 13645 49876 ...
##  $ Income from YouTube (rs,month)                      : num [1:1628] 88447 64764 4387 99695 81297 ...
##  $ Internet Speed (Mbps)                               : num [1:1628] 46.6 83.5 50.5 99.6 22.1 ...
##  $ Total YouTube Usage (minutes)                       : num [1:1628] 272543 220056 2629 154271 178485 ...
##  $ Weekly YouTube Usage (minutes)                      : num [1:1628] 305 2090 528 1545 2836 ...
##  $ Profession/Activity                                 : chr [1:1628] "Photographer" "Singer" "Content Creator" "Blogger" ...
##  $ Likes on Instagram                                  : chr [1:1628] "679k" "400k" "447k" "443k" ...
##  $ Shares on Instagram                                 : chr [1:1628] "16K" "28K" "86K" "39K" ...
##  $ Hobby                                               : chr [1:1628] "Reading" "Reading" "Playing Musical Instruments" "Cooking/Baking" ...
##  $ Verified Account on instagram                       : chr [1:1628] "Yes" "Yes" "No" "No" ...
##  $ Date of Birth                                       : chr [1:1628] "8/31/1982" "12/17/1992" "4/16/2002" "12/10/1965" ...
##  $ Time Spent on Twitter per Week (minutes)            : num [1:1628] 356 1774 1848 199 1210 ...
##  $ Total Time Spent on Twitter (minutes)               : num [1:1628] 2492 12418 12936 1393 8470 ...
##  $ Groups Joined on Instagram                          : num [1:1628] 4 5 2 4 3 9 2 3 2 6 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Age = col_double(),
##   ..   City = col_character(),
##   ..   `Current Status` = col_character(),
##   ..   `Do you own multiple profiles on Instagram?` = col_character(),
##   ..   Gender = col_character(),
##   ..   `Highest Education` = col_character(),
##   ..   `Location (City Airport Code)` = col_character(),
##   ..   `Phone OS` = col_character(),
##   ..   State = col_character(),
##   ..   Zone = col_character(),
##   ..   `How many followers do you have on Instagram?` = col_number(),
##   ..   `How many posts do you have on Instagram?` = col_number(),
##   ..   Latitude = col_double(),
##   ..   Longitude = col_double(),
##   ..   `Time Spent on Facebook in last week (in minutes)` = col_number(),
##   ..   `Time Spent on Facebook in last weekend (in minutes)` = col_number(),
##   ..   `Time Spent on Instagram in last week (in minutes)` = col_number(),
##   ..   `Time Spent on Instagram in last weekend (in minutes)` = col_number(),
##   ..   `Time Spent on WhatsApp in last week (in minutes)` = col_number(),
##   ..   `Time Spent on WhatsApp in last weekend (in minutes)` = col_number(),
##   ..   `Total Facebook Usage` = col_number(),
##   ..   `Total Instagram Usage` = col_number(),
##   ..   `Total Social Media Usage` = col_number(),
##   ..   `Total Week Usage` = col_number(),
##   ..   `Total Weekend Usage` = col_number(),
##   ..   `Total WhatsApp Usage` = col_number(),
##   ..   `How many subscriber do you have on youtube` = col_double(),
##   ..   `Income from YouTube (rs,month)` = col_double(),
##   ..   `Internet Speed (Mbps)` = col_double(),
##   ..   `Total YouTube Usage (minutes)` = col_double(),
##   ..   `Weekly YouTube Usage (minutes)` = col_double(),
##   ..   `Profession/Activity` = col_character(),
##   ..   `Likes on Instagram` = col_character(),
##   ..   `Shares on Instagram` = col_character(),
##   ..   Hobby = col_character(),
##   ..   `Verified Account on instagram` = col_character(),
##   ..   `Date of Birth` = col_character(),
##   ..   `Time Spent on Twitter per Week (minutes)` = col_double(),
##   ..   `Total Time Spent on Twitter (minutes)` = col_double(),
##   ..   `Groups Joined on Instagram` = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

# Per-column summary: quartiles and mean for numeric columns, length and
# class for character columns
summary(object = data)

##       Age            City           Current Status    
##  Min.   :13.00   Length:1628        Length:1628       
##  1st Qu.:22.00   Class :character   Class :character  
##  Median :24.00   Mode  :character   Mode  :character  
##  Mean   :26.86                                        
##  3rd Qu.:27.00                                        
##  Max.   :74.00                                        
##  Do you own multiple profiles on Instagram?    Gender         
##  Length:1628                                Length:1628       
##  Class :character                           Class :character  
##  Mode  :character                           Mode  :character  
##                                                               
##                                                               
##                                                               
##  Highest Education  Location (City Airport Code)   Phone OS        
##  Length:1628        Length:1628                  Length:1628       
##  Class :character   Class :character             Class :character  
##  Mode  :character   Mode  :character             Mode  :character  
##                                                                    
##                                                                    
##                                                                    
##     State               Zone          
##  Length:1628        Length:1628       
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
##                                       
##  How many followers do you have on Instagram?
##  Min.   :     0.0                            
##  1st Qu.:   183.0                            
##  Median :   370.0                            
##  Mean   :   868.1                            
##  3rd Qu.:   657.0                            
##  Max.   :116000.0                            
##  How many posts do you have on Instagram?    Latitude        Longitude    
##  Min.   :   0.00                          Min.   : 8.486   Min.   :69.67  
##  1st Qu.:  10.00                          1st Qu.:18.988   1st Qu.:72.84  
##  Median :  43.50                          Median :22.563   Median :77.23  
##  Mean   :  99.08                          Mean   :22.760   Mean   :77.89  
##  3rd Qu.: 111.25                          3rd Qu.:28.652   3rd Qu.:78.46  
##  Max.   :2858.00                          Max.   :32.736   Max.   :94.91  
##  Time Spent on Facebook in last week (in minutes)
##  Min.   :   0.0                                  
##  1st Qu.:   2.0                                  
##  Median :  63.0                                  
##  Mean   : 175.2                                  
##  3rd Qu.: 240.0                                  
##  Max.   :6000.0                                  
##  Time Spent on Facebook in last weekend (in minutes)
##  Min.   :   0.00                                    
##  1st Qu.:   0.00                                    
##  Median :  30.00                                    
##  Mean   :  75.69                                    
##  3rd Qu.:  89.00                                    
##  Max.   :2160.00                                    
##  Time Spent on Instagram in last week (in minutes)
##  Min.   :   0.0                                   
##  1st Qu.: 120.0                                   
##  Median : 357.0                                   
##  Mean   : 505.2                                   
##  3rd Qu.: 675.0                                   
##  Max.   :6000.0                                   
##  Time Spent on Instagram in last weekend (in minutes)
##  Min.   :   0.0                                      
##  1st Qu.:  48.0                                      
##  Median : 135.0                                      
##  Mean   : 215.0                                      
##  3rd Qu.: 281.5                                      
##  Max.   :2560.0                                      
##  Time Spent on WhatsApp in last week (in minutes)
##  Min.   :   4.0                                  
##  1st Qu.: 300.0                                  
##  Median : 600.0                                  
##  Mean   : 854.9                                  
##  3rd Qu.:1009.0                                  
##  Max.   :7000.0                                  
##  Time Spent on WhatsApp in last weekend (in minutes) Total Facebook Usage
##  Min.   :   0.0                                      Min.   :   0.0      
##  1st Qu.: 100.0                                      1st Qu.:  10.0      
##  Median : 200.0                                      Median : 101.5      
##  Mean   : 294.9                                      Mean   : 250.9      
##  3rd Qu.: 360.0                                      3rd Qu.: 334.2      
##  Max.   :2800.0                                      Max.   :8160.0      
##  Total Instagram Usage Total Social Media Usage Total Week Usage
##  Min.   :   0.0        Min.   :   12            Min.   :    8   
##  1st Qu.: 190.8        1st Qu.:  970            1st Qu.:  670   
##  Median : 522.5        Median : 1658            Median : 1170   
##  Mean   : 720.2        Mean   : 2121            Mean   : 1535   
##  3rd Qu.: 970.0        3rd Qu.: 2670            3rd Qu.: 1895   
##  Max.   :8240.0        Max.   :15780            Max.   :12734   
##  Total Weekend Usage Total WhatsApp Usage
##  Min.   :   0.0      Min.   :   9        
##  1st Qu.: 243.0      1st Qu.: 450        
##  Median : 425.5      Median : 812        
##  Mean   : 585.6      Mean   :1150        
##  3rd Qu.: 709.0      3rd Qu.:1400        
##  Max.   :5180.0      Max.   :9000        
##  How many subscriber do you have on youtube Income from YouTube (rs,month)
##  Min.   :   33                              Min.   :   11                 
##  1st Qu.:12783                              1st Qu.:23870                 
##  Median :24629                              Median :47898                 
##  Mean   :24795                              Mean   :49166                 
##  3rd Qu.:36702                              3rd Qu.:74162                 
##  Max.   :49939                              Max.   :99991                 
##  Internet Speed (Mbps) Total YouTube Usage (minutes)
##  Min.   : 2.03         Min.   :   224               
##  1st Qu.:24.82         1st Qu.: 78476               
##  Median :50.47         Median :153087               
##  Mean   :50.34         Mean   :150846               
##  3rd Qu.:75.83         3rd Qu.:225062               
##  Max.   :99.93         Max.   :299562               
##  Weekly YouTube Usage (minutes) Profession/Activity Likes on Instagram
##  Min.   :   2                   Length:1628         Length:1628       
##  1st Qu.: 782                   Class :character    Class :character  
##  Median :1456                   Mode  :character    Mode  :character  
##  Mean   :1480                                                         
##  3rd Qu.:2226                                                         
##  Max.   :2998                                                         
##  Shares on Instagram    Hobby           Verified Account on instagram
##  Length:1628         Length:1628        Length:1628                  
##  Class :character    Class :character   Class :character             
##  Mode  :character    Mode  :character   Mode  :character             
##                                                                      
##                                                                      
##                                                                      
##  Date of Birth      Time Spent on Twitter per Week (minutes)
##  Length:1628        Min.   :   0                            
##  Class :character   1st Qu.: 561                            
##  Mode  :character   Median :1168                            
##                     Mean   :1179                            
##                     3rd Qu.:1790                            
##                     Max.   :2399                            
##  Total Time Spent on Twitter (minutes) Groups Joined on Instagram
##  Min.   :    0                         Min.   : 0.00             
##  1st Qu.: 3927                         1st Qu.: 2.00             
##  Median : 8180                         Median : 5.00             
##  Mean   : 8255                         Mean   : 4.96             
##  3rd Qu.:12532                         3rd Qu.: 8.00             
##  Max.   :16793                         Max.   :10.00

# Preview the first six rows of the dataset
head(data, n = 6L)

## # A tibble: 6 × 40
##     Age City  `Current Status` Do you own multiple …¹ Gender `Highest Education`
##   <dbl> <chr> <chr>            <chr>                  <chr>  <chr>              
## 1    24 Delhi Working profess… No                     Female Graduation         
## 2    39 Delhi Working profess… No                     Female Post graduation    
## 3    22 Mumb… Working profess… No                     Male   Graduation         
## 4    26 Beng… Sabbatical       Yes                    Female Graduation         
## 5    50 Delhi Working profess… No                     Male   Graduation         
## 6    25 Vish… Working profess… Yes                    Female Post graduation    
## # ℹ abbreviated name: ¹`Do you own multiple profiles on Instagram?`
## # ℹ 34 more variables: `Location (City Airport Code)` <chr>, `Phone OS` <chr>,
## #   State <chr>, Zone <chr>,
## #   `How many followers do you have on Instagram?` <dbl>,
## #   `How many posts do you have on Instagram?` <dbl>, Latitude <dbl>,
## #   Longitude <dbl>, `Time Spent on Facebook in last week (in minutes)` <dbl>,
## #   `Time Spent on Facebook in last weekend (in minutes)` <dbl>, …

# Create a matrix from the first 3 rows and 3 columns of the data.
# matrix() treats a data frame as a list of columns, so it produced a
# list-matrix whose cells each held an entire column ("numeric,3",
# "character,3"). as.matrix() converts the values cell by cell instead.
# The selected columns mix numeric and character data, so the result is a
# character matrix.
matrix_data <- as.matrix(data[1:3, 1:3])

# Print the matrix
matrix_data

##      Age  City     Current Status        
## [1,] "24" "Delhi"  "Working professional"
## [2,] "39" "Delhi"  "Working professional"
## [3,] "22" "Mumbai" "Working professional"

# Mean of the Age column
mean(data[["Age"]])

## [1] 26.85811

# Median of the Age column
median(data[["Age"]])

## [1] 24

# Largest value in the Age column
max(data[["Age"]])

## [1] 74

# Smallest value in the Age column
min(data[["Age"]])

## [1] 13

# Count the rows whose Gender is "Male" (summing a logical vector counts
# its TRUE values)
sum(data[["Gender"]] == "Male")

## [1] 813

# Count the rows whose Gender is "Female".
# The two counts total 1626 of the 1628 rows, so two rows carry some other
# Gender value.
sum(data[["Gender"]] == "Female")

## [1] 813

# Mean Instagram follower count
mean(data[["How many followers do you have on Instagram?"]])

## [1] 868.1474

# Median Instagram follower count
median(data[["How many followers do you have on Instagram?"]])

## [1] 370

# Largest Instagram follower count
max(data[["How many followers do you have on Instagram?"]])

## [1] 116000

# Smallest Instagram follower count
min(data[["How many followers do you have on Instagram?"]])

## [1] 0

# Mean Instagram post count
mean(data[["How many posts do you have on Instagram?"]])

## [1] 99.07985

# Median Instagram post count
median(data[["How many posts do you have on Instagram?"]])

## [1] 43.5

# Largest Instagram post count
max(data[["How many posts do you have on Instagram?"]])

## [1] 2858

# Smallest Instagram post count
min(data[["How many posts do you have on Instagram?"]])

## [1] 0

# Minutes spent on Facebook in the last week, summed over all rows
sum(data[["Time Spent on Facebook in last week (in minutes)"]])

## [1] 285275

# Minutes spent on Instagram in the last week, summed over all rows
sum(data[["Time Spent on Instagram in last week (in minutes)"]])

## [1] 822407

# Minutes spent on WhatsApp in the last week, summed over all rows
sum(data[["Time Spent on WhatsApp in last week (in minutes)"]])

## [1] 1391726

# Mean of the Total Social Media Usage column
mean(data[["Total Social Media Usage"]])

## [1] 2120.885

# Median of the Total Social Media Usage column
median(data[["Total Social Media Usage"]])

## [1] 1658.5

# Largest value in the Total Social Media Usage column
max(data[["Total Social Media Usage"]])

## [1] 15780

Question: In what ways can the data be filtered by conditions such as gender, education, location, phone OS, and social media usage?

# Keep the rows whose Gender is "Female". %in% yields FALSE (never NA) for a
# missing value, so rows without a Gender are left out.
females <- data[data[["Gender"]] %in% "Female", ]
females

## # A tibble: 813 × 40
##      Age City           `Current Status`     Do you own multiple profil…¹ Gender
##    <dbl> <chr>          <chr>                <chr>                        <chr> 
##  1    24 Delhi          Working professional No                           Female
##  2    39 Delhi          Working professional No                           Female
##  3    26 Bengaluru      Sabbatical           Yes                          Female
##  4    25 Vishakhapatnam Working professional Yes                          Female
##  5    45 Durgapur       Sabbatical           No                           Female
##  6    45 Delhi          Working professional No                           Female
##  7    21 Delhi          Working professional No                           Female
##  8    26 Delhi          Working professional No                           Female
##  9    25 Mumbai         Sabbatical           No                           Female
## 10    22 Kolkata        Student              No                           Female
## # ℹ 803 more rows
## # ℹ abbreviated name: ¹`Do you own multiple profiles on Instagram?`
## # ℹ 35 more variables: `Highest Education` <chr>,
## #   `Location (City Airport Code)` <chr>, `Phone OS` <chr>, State <chr>,
## #   Zone <chr>, `How many followers do you have on Instagram?` <dbl>,
## #   `How many posts do you have on Instagram?` <dbl>, Latitude <dbl>,
## #   Longitude <dbl>, …

# Keep the rows whose Gender is "Male". %in% yields FALSE (never NA) for a
# missing value, so rows without a Gender are left out.
males <- data[data[["Gender"]] %in% "Male", ]
males

## # A tibble: 813 × 40
##      Age City      `Current Status`     Do you own multiple profiles on…¹ Gender
##    <dbl> <chr>     <chr>                <chr>                             <chr> 
##  1    22 Mumbai    Working professional No                                Male  
##  2    50 Delhi     Working professional No                                Male  
##  3    52 Jaipur    Working professional No                                Male  
##  4    25 Bengaluru Student              No                                Male  
##  5    27 Delhi     Student              Yes                               Male  
##  6    27 Bengaluru Working professional No                                Male  
##  7    22 Delhi     Sabbatical           Yes                               Male  
##  8    26 Agra      Working professional No                                Male  
##  9    25 Ahmedabad Student              No                                Male  
## 10    18 Jaipur    Student              No                                Male  
## # ℹ 803 more rows
## # ℹ abbreviated name: ¹`Do you own multiple profiles on Instagram?`
## # ℹ 35 more variables: `Highest Education` <chr>,
## #   `Location (City Airport Code)` <chr>, `Phone OS` <chr>, State <chr>,
## #   Zone <chr>, `How many followers do you have on Instagram?` <dbl>,
## #   `How many posts do you have on Instagram?` <dbl>, Latitude <dbl>,
## #   Longitude <dbl>, …

# Keep the rows whose Current Status is "Student"; rows with a missing
# status are left out because %in% yields FALSE for NA.
students <- data[data[["Current Status"]] %in% "Student", ]
students

## # A tibble: 637 × 40
##      Age City      `Current Status` Do you own multiple profiles on Ins…¹ Gender
##    <dbl> <chr>     <chr>            <chr>                                 <chr> 
##  1    25 Bengaluru Student          No                                    Male  
##  2    27 Delhi     Student          Yes                                   Male  
##  3    25 Ahmedabad Student          No                                    Male  
##  4    18 Jaipur    Student          No                                    Male  
##  5    22 Kolkata   Student          No                                    Female
##  6    26 Kolkata   Student          No                                    Female
##  7    23 Delhi     Student          No                                    Female
##  8    17 Mumbai    Student          Yes                                   Male  
##  9    23 Kolkata   Student          Yes                                   Female
## 10    22 Ahmedabad Student          No                                    Male  
## # ℹ 627 more rows
## # ℹ abbreviated name: ¹`Do you own multiple profiles on Instagram?`
## # ℹ 35 more variables: `Highest Education` <chr>,
## #   `Location (City Airport Code)` <chr>, `Phone OS` <chr>, State <chr>,
## #   Zone <chr>, `How many followers do you have on Instagram?` <dbl>,
## #   `How many posts do you have on Instagram?` <dbl>, Latitude <dbl>,
## #   Longitude <dbl>, …

# Keep the rows whose Current Status is "Working professional"; rows with a
# missing status are left out because %in% yields FALSE for NA.
working_professionals <-
  data[data[["Current Status"]] %in% "Working professional", ]
working_professionals

## # A tibble: 796 × 40
##      Age City           `Current Status`     Do you own multiple profil…¹ Gender
##    <dbl> <chr>          <chr>                <chr>                        <chr> 
##  1    24 Delhi          Working professional No                           Female
##  2    39 Delhi          Working professional No                           Female
##  3    22 Mumbai         Working professional No                           Male  
##  4    50 Delhi          Working professional No                           Male  
##  5    25 Vishakhapatnam Working professional Yes                          Female
##  6    52 Jaipur         Working professional No                           Male  
##  7    27 Bengaluru      Working professional No                           Male  
##  8    45 Delhi          Working professional No                           Female
##  9    21 Delhi          Working professional No                           Female
## 10    26 Agra           Working professional No                           Male  
## # ℹ 786 more rows
## # ℹ abbreviated name: ¹`Do you own multiple profiles on Instagram?`
## # ℹ 35 more variables: `Highest Education` <chr>,
## #   `Location (City Airport Code)` <chr>, `Phone OS` <chr>, State <chr>,
## #   Zone <chr>, `How many followers do you have on Instagram?` <dbl>,
## #   `How many posts do you have on Instagram?` <dbl>, Latitude <dbl>,
## #   Longitude <dbl>, …

# Keep the rows whose Zone is "Northern"; rows with a missing Zone are left
# out because %in% yields FALSE for NA.
northern <- data[data[["Zone"]] %in% "Northern", ]
northern

## # A tibble: 542 × 40
##      Age City   `Current Status`     Do you own multiple profiles on In…¹ Gender
##    <dbl> <chr>  <chr>                <chr>                                <chr> 
##  1    24 Delhi  Working professional No                                   Female
##  2    39 Delhi  Working professional No                                   Female
##  3    50 Delhi  Working professional No                                   Male  
##  4    52 Jaipur Working professional No                                   Male  
##  5    27 Delhi  Student              Yes                                  Male  
##  6    45 Delhi  Working professional No                                   Female
##  7    22 Delhi  Sabbatical           Yes                                  Male  
##  8    21 Delhi  Working professional No                                   Female
##  9    26 Agra   Working professional No                                   Male  
## 10    26 Delhi  Working professional No                                   Female
## # ℹ 532 more rows
## # ℹ abbreviated name: ¹`Do you own multiple profiles on Instagram?`
## # ℹ 35 more variables: `Highest Education` <chr>,
## #   `Location (City Airport Code)` <chr>, `Phone OS` <chr>, State <chr>,
## #   Zone <chr>, `How many followers do you have on Instagram?` <dbl>,
## #   `How many posts do you have on Instagram?` <dbl>, Latitude <dbl>,
## #   Longitude <dbl>, …

# Keep the rows whose Zone is "Southern"; rows with a missing Zone are left
# out because %in% yields FALSE for NA.
southern <- data[data[["Zone"]] %in% "Southern", ]
southern

## # A tibble: 211 × 40
##      Age City           `Current Status`     Do you own multiple profil…¹ Gender
##    <dbl> <chr>          <chr>                <chr>                        <chr> 
##  1    26 Bengaluru      Sabbatical           Yes                          Female
##  2    25 Vishakhapatnam Working professional Yes                          Female
##  3    25 Bengaluru      Student              No                           Male  
##  4    27 Bengaluru      Working professional No                           Male  
##  5    32 Bengaluru      Working professional No                           Male  
##  6    27 Chennai        Working professional No                           Male  
##  7    23 Chennai        Student              No                           Female
##  8    22 Chennai        Student              Yes                          Female
##  9    32 Bengaluru      Working professional No                           Female
## 10    21 Chennai        Student              No                           Female
## # ℹ 201 more rows
## # ℹ abbreviated name: ¹`Do you own multiple profiles on Instagram?`
## # ℹ 35 more variables: `Highest Education` <chr>,
## #   `Location (City Airport Code)` <chr>, `Phone OS` <chr>, State <chr>,
## #   Zone <chr>, `How many followers do you have on Instagram?` <dbl>,
## #   `How many posts do you have on Instagram?` <dbl>, Latitude <dbl>,
## #   Longitude <dbl>, …

# Keep the rows whose Zone is "Eastern"; rows with a missing Zone are left
# out because %in% yields FALSE for NA.
eastern <- data[data[["Zone"]] %in% "Eastern", ]
eastern

## # A tibble: 271 × 40
##      Age City        `Current Status`     Do you own multiple profiles …¹ Gender
##    <dbl> <chr>       <chr>                <chr>                           <chr> 
##  1    45 Durgapur    Sabbatical           No                              Female
##  2    24 Cooch-behar Working professional No                              Male  
##  3    22 Kolkata     Student              No                              Female
##  4    26 Kolkata     Student              No                              Female
##  5    50 Kolkata     Working professional No                              Female
##  6    23 Kolkata     Student              Yes                             Female
##  7    25 Kolkata     Working professional No                              Male  
##  8    45 Bagdogra    Working professional Yes                             Female
##  9    25 Kolkata     Student              No                              Male  
## 10    45 Kolkata     Working professional No                              Female
## # ℹ 261 more rows
## # ℹ abbreviated name: ¹`Do you own multiple profiles on Instagram?`
## # ℹ 35 more variables: `Highest Education` <chr>,
## #   `Location (City Airport Code)` <chr>, `Phone OS` <chr>, State <chr>,
## #   Zone <chr>, `How many followers do you have on Instagram?` <dbl>,
## #   `How many posts do you have on Instagram?` <dbl>, Latitude <dbl>,
## #   Longitude <dbl>, …

# Keep the rows whose Zone is "Western"; rows with a missing Zone are left
# out because %in% yields FALSE for NA.
western <- data[data[["Zone"]] %in% "Western", ]
western

## # A tibble: 543 × 40
##      Age City      `Current Status`     Do you own multiple profiles on…¹ Gender
##    <dbl> <chr>     <chr>                <chr>                             <chr> 
##  1    22 Mumbai    Working professional No                                Male  
##  2    25 Ahmedabad Student              No                                Male  
##  3    25 Mumbai    Sabbatical           No                                Female
##  4    25 Ahmedabad Self Employed        Yes                               Male  
##  5    17 Mumbai    Student              Yes                               Male  
##  6    22 Ahmedabad Student              No                                Male  
##  7    24 Pune      Working professional No                                Female
##  8    21 Mumbai    Sabbatical           Yes                               Female
##  9    51 Ahmedabad Working professional No                                Male  
## 10    22 Mumbai    Sabbatical           No                                Male  
## # ℹ 533 more rows
## # ℹ abbreviated name: ¹`Do you own multiple profiles on Instagram?`
## # ℹ 35 more variables: `Highest Education` <chr>,
## #   `Location (City Airport Code)` <chr>, `Phone OS` <chr>, State <chr>,
## #   Zone <chr>, `How many followers do you have on Instagram?` <dbl>,
## #   `How many posts do you have on Instagram?` <dbl>, Latitude <dbl>,
## #   Longitude <dbl>, …

# Respondents whose phone runs Android (rows with a missing OS are dropped)
android <- data[which(data$`Phone OS` == "Android"), ]
print(android)

## # A tibble: 1,115 × 40
##      Age City           `Current Status`     Do you own multiple profil…¹ Gender
##    <dbl> <chr>          <chr>                <chr>                        <chr> 
##  1    22 Mumbai         Working professional No                           Male  
##  2    26 Bengaluru      Sabbatical           Yes                          Female
##  3    25 Vishakhapatnam Working professional Yes                          Female
##  4    52 Jaipur         Working professional No                           Male  
##  5    45 Durgapur       Sabbatical           No                           Female
##  6    25 Bengaluru      Student              No                           Male  
##  7    27 Delhi          Student              Yes                          Male  
##  8    27 Bengaluru      Working professional No                           Male  
##  9    21 Delhi          Working professional No                           Female
## 10    26 Agra           Working professional No                           Male  
## # ℹ 1,105 more rows
## # ℹ abbreviated name: ¹`Do you own multiple profiles on Instagram?`
## # ℹ 35 more variables: `Highest Education` <chr>,
## #   `Location (City Airport Code)` <chr>, `Phone OS` <chr>, State <chr>,
## #   Zone <chr>, `How many followers do you have on Instagram?` <dbl>,
## #   `How many posts do you have on Instagram?` <dbl>, Latitude <dbl>,
## #   Longitude <dbl>, …

# Respondents whose phone OS is recorded as "iOs" (missing OS rows are dropped)
ios <- data[which(data$`Phone OS` == "iOs"), ]
print(ios)

## # A tibble: 508 × 40
##      Age City     `Current Status`     Do you own multiple profiles on …¹ Gender
##    <dbl> <chr>    <chr>                <chr>                              <chr> 
##  1    24 Delhi    Working professional No                                 Female
##  2    39 Delhi    Working professional No                                 Female
##  3    50 Delhi    Working professional No                                 Male  
##  4    45 Delhi    Working professional No                                 Female
##  5    22 Delhi    Sabbatical           Yes                                Male  
##  6    18 Jaipur   Student              No                                 Male  
##  7    22 Chennai  Student              Yes                                Female
##  8    23 Guwahati Student              No                                 Female
##  9    45 Bagdogra Working professional Yes                                Female
## 10    28 Kolkata  Sabbatical           No                                 Female
## # ℹ 498 more rows
## # ℹ abbreviated name: ¹`Do you own multiple profiles on Instagram?`
## # ℹ 35 more variables: `Highest Education` <chr>,
## #   `Location (City Airport Code)` <chr>, `Phone OS` <chr>, State <chr>,
## #   Zone <chr>, `How many followers do you have on Instagram?` <dbl>,
## #   `How many posts do you have on Instagram?` <dbl>, Latitude <dbl>,
## #   Longitude <dbl>, …

# Respondents whose highest education is "Graduation"
graduation <- data[which(data$`Highest Education` == "Graduation"), ]
print(graduation)

## # A tibble: 950 × 40
##      Age City      `Current Status`     Do you own multiple profiles on…¹ Gender
##    <dbl> <chr>     <chr>                <chr>                             <chr> 
##  1    24 Delhi     Working professional No                                Female
##  2    22 Mumbai    Working professional No                                Male  
##  3    26 Bengaluru Sabbatical           Yes                               Female
##  4    50 Delhi     Working professional No                                Male  
##  5    45 Durgapur  Sabbatical           No                                Female
##  6    25 Bengaluru Student              No                                Male  
##  7    27 Delhi     Student              Yes                               Male  
##  8    27 Bengaluru Working professional No                                Male  
##  9    45 Delhi     Working professional No                                Female
## 10    21 Delhi     Working professional No                                Female
## # ℹ 940 more rows
## # ℹ abbreviated name: ¹`Do you own multiple profiles on Instagram?`
## # ℹ 35 more variables: `Highest Education` <chr>,
## #   `Location (City Airport Code)` <chr>, `Phone OS` <chr>, State <chr>,
## #   Zone <chr>, `How many followers do you have on Instagram?` <dbl>,
## #   `How many posts do you have on Instagram?` <dbl>, Latitude <dbl>,
## #   Longitude <dbl>, …

# Respondents whose highest education is "Post graduation"
post_graduation <- data[
  which(data$`Highest Education` == "Post graduation"),
]
print(post_graduation)

## # A tibble: 541 × 40
##      Age City           `Current Status`     Do you own multiple profil…¹ Gender
##    <dbl> <chr>          <chr>                <chr>                        <chr> 
##  1    39 Delhi          Working professional No                           Female
##  2    25 Vishakhapatnam Working professional Yes                          Female
##  3    52 Jaipur         Working professional No                           Male  
##  4    22 Delhi          Sabbatical           Yes                          Male  
##  5    26 Delhi          Working professional No                           Female
##  6    25 Mumbai         Sabbatical           No                           Female
##  7    22 Kolkata        Student              No                           Female
##  8    26 Kolkata        Student              No                           Female
##  9    27 Chennai        Working professional No                           Male  
## 10    32 Bengaluru      Working professional No                           Female
## # ℹ 531 more rows
## # ℹ abbreviated name: ¹`Do you own multiple profiles on Instagram?`
## # ℹ 35 more variables: `Highest Education` <chr>,
## #   `Location (City Airport Code)` <chr>, `Phone OS` <chr>, State <chr>,
## #   Zone <chr>, `How many followers do you have on Instagram?` <dbl>,
## #   `How many posts do you have on Instagram?` <dbl>, Latitude <dbl>,
## #   Longitude <dbl>, …

# Respondents whose highest education is "High School"
high_school <- data[which(data$`Highest Education` == "High School"), ]
print(high_school)

## # A tibble: 137 × 40
##      Age City      `Current Status`     Do you own multiple profiles on…¹ Gender
##    <dbl> <chr>     <chr>                <chr>                             <chr> 
##  1    18 Jaipur    Student              No                                Male  
##  2    35 Delhi     Working professional Yes                               Female
##  3    23 Delhi     Student              No                                Female
##  4    50 Kolkata   Working professional No                                Female
##  5    17 Mumbai    Student              Yes                               Male  
##  6    16 Kolkata   Student              No                                Male  
##  7    16 Jaipur    Student              No                                Female
##  8    15 Chennai   Student              Yes                               Female
##  9    20 Hyderabad Student              Yes                               Male  
## 10    16 Mumbai    Student              No                                Male  
## # ℹ 127 more rows
## # ℹ abbreviated name: ¹`Do you own multiple profiles on Instagram?`
## # ℹ 35 more variables: `Highest Education` <chr>,
## #   `Location (City Airport Code)` <chr>, `Phone OS` <chr>, State <chr>,
## #   Zone <chr>, `How many followers do you have on Instagram?` <dbl>,
## #   `How many posts do you have on Instagram?` <dbl>, Latitude <dbl>,
## #   Longitude <dbl>, …

# Respondents who answered "Yes" to owning multiple Instagram profiles
multiple_profiles <- data[
  which(data$`Do you own multiple profiles on Instagram?` == "Yes"),
]
print(multiple_profiles)

## # A tibble: 308 × 40
##      Age City           `Current Status`     Do you own multiple profil…¹ Gender
##    <dbl> <chr>          <chr>                <chr>                        <chr> 
##  1    26 Bengaluru      Sabbatical           Yes                          Female
##  2    25 Vishakhapatnam Working professional Yes                          Female
##  3    27 Delhi          Student              Yes                          Male  
##  4    22 Delhi          Sabbatical           Yes                          Male  
##  5    25 Ahmedabad      Self Employed        Yes                          Male  
##  6    35 Delhi          Working professional Yes                          Female
##  7    17 Mumbai         Student              Yes                          Male  
##  8    23 Kolkata        Student              Yes                          Female
##  9    26 Kanpur         Working professional Yes                          Male  
## 10    22 Chennai        Student              Yes                          Female
## # ℹ 298 more rows
## # ℹ abbreviated name: ¹`Do you own multiple profiles on Instagram?`
## # ℹ 35 more variables: `Highest Education` <chr>,
## #   `Location (City Airport Code)` <chr>, `Phone OS` <chr>, State <chr>,
## #   Zone <chr>, `How many followers do you have on Instagram?` <dbl>,
## #   `How many posts do you have on Instagram?` <dbl>, Latitude <dbl>,
## #   Longitude <dbl>, …

# Respondents who answered "No" to owning multiple Instagram profiles
single_profile <- data[
  which(data$`Do you own multiple profiles on Instagram?` == "No"),
]
print(single_profile)

## # A tibble: 1,316 × 40
##      Age City      `Current Status`     Do you own multiple profiles on…¹ Gender
##    <dbl> <chr>     <chr>                <chr>                             <chr> 
##  1    24 Delhi     Working professional No                                Female
##  2    39 Delhi     Working professional No                                Female
##  3    22 Mumbai    Working professional No                                Male  
##  4    50 Delhi     Working professional No                                Male  
##  5    52 Jaipur    Working professional No                                Male  
##  6    45 Durgapur  Sabbatical           No                                Female
##  7    25 Bengaluru Student              No                                Male  
##  8    27 Bengaluru Working professional No                                Male  
##  9    45 Delhi     Working professional No                                Female
## 10    21 Delhi     Working professional No                                Female
## # ℹ 1,306 more rows
## # ℹ abbreviated name: ¹`Do you own multiple profiles on Instagram?`
## # ℹ 35 more variables: `Highest Education` <chr>,
## #   `Location (City Airport Code)` <chr>, `Phone OS` <chr>, State <chr>,
## #   Zone <chr>, `How many followers do you have on Instagram?` <dbl>,
## #   `How many posts do you have on Instagram?` <dbl>, Latitude <dbl>,
## #   Longitude <dbl>, …

# Respondents with more than 500 minutes on Facebook in the last week
# (rows where the time is missing are dropped)
more_than_500_fb <- data[
  which(data$`Time Spent on Facebook in last week (in minutes)` > 500),
]
print(more_than_500_fb)

## # A tibble: 130 × 40
##      Age City           `Current Status`     Do you own multiple profil…¹ Gender
##    <dbl> <chr>          <chr>                <chr>                        <chr> 
##  1    39 Delhi          Working professional No                           Female
##  2    26 Bengaluru      Sabbatical           Yes                          Female
##  3    50 Delhi          Working professional No                           Male  
##  4    25 Vishakhapatnam Working professional Yes                          Female
##  5    45 Durgapur       Sabbatical           No                           Female
##  6    25 Bengaluru      Student              No                           Male  
##  7    27 Bengaluru      Working professional No                           Male  
##  8    22 Delhi          Sabbatical           Yes                          Male  
##  9    18 Jaipur         Student              No                           Male  
## 10    25 Mumbai         Sabbatical           No                           Female
## # ℹ 120 more rows
## # ℹ abbreviated name: ¹`Do you own multiple profiles on Instagram?`
## # ℹ 35 more variables: `Highest Education` <chr>,
## #   `Location (City Airport Code)` <chr>, `Phone OS` <chr>, State <chr>,
## #   Zone <chr>, `How many followers do you have on Instagram?` <dbl>,
## #   `How many posts do you have on Instagram?` <dbl>, Latitude <dbl>,
## #   Longitude <dbl>, …

# Respondents with more than 500 minutes on Instagram in the last week
# (rows where the time is missing are dropped)
more_than_500_insta <- data[
  which(data$`Time Spent on Instagram in last week (in minutes)` > 500),
]
print(more_than_500_insta)

## # A tibble: 559 × 40
##      Age City           `Current Status`     Do you own multiple profil…¹ Gender
##    <dbl> <chr>          <chr>                <chr>                        <chr> 
##  1    24 Delhi          Working professional No                           Female
##  2    22 Mumbai         Working professional No                           Male  
##  3    26 Bengaluru      Sabbatical           Yes                          Female
##  4    25 Vishakhapatnam Working professional Yes                          Female
##  5    25 Bengaluru      Student              No                           Male  
##  6    27 Delhi          Student              Yes                          Male  
##  7    45 Delhi          Working professional No                           Female
##  8    22 Delhi          Sabbatical           Yes                          Male  
##  9    18 Jaipur         Student              No                           Male  
## 10    25 Mumbai         Sabbatical           No                           Female
## # ℹ 549 more rows
## # ℹ abbreviated name: ¹`Do you own multiple profiles on Instagram?`
## # ℹ 35 more variables: `Highest Education` <chr>,
## #   `Location (City Airport Code)` <chr>, `Phone OS` <chr>, State <chr>,
## #   Zone <chr>, `How many followers do you have on Instagram?` <dbl>,
## #   `How many posts do you have on Instagram?` <dbl>, Latitude <dbl>,
## #   Longitude <dbl>, …

# Respondents with more than 500 minutes on WhatsApp in the last week
# (rows where the time is missing are dropped)
more_than_500_whatsapp <- data[
  which(data$`Time Spent on WhatsApp in last week (in minutes)` > 500),
]
print(more_than_500_whatsapp)

## # A tibble: 913 × 40
##      Age City           `Current Status`     Do you own multiple profil…¹ Gender
##    <dbl> <chr>          <chr>                <chr>                        <chr> 
##  1    24 Delhi          Working professional No                           Female
##  2    39 Delhi          Working professional No                           Female
##  3    22 Mumbai         Working professional No                           Male  
##  4    26 Bengaluru      Sabbatical           Yes                          Female
##  5    50 Delhi          Working professional No                           Male  
##  6    25 Vishakhapatnam Working professional Yes                          Female
##  7    52 Jaipur         Working professional No                           Male  
##  8    45 Durgapur       Sabbatical           No                           Female
##  9    25 Bengaluru      Student              No                           Male  
## 10    27 Delhi          Student              Yes                          Male  
## # ℹ 903 more rows
## # ℹ abbreviated name: ¹`Do you own multiple profiles on Instagram?`
## # ℹ 35 more variables: `Highest Education` <chr>,
## #   `Location (City Airport Code)` <chr>, `Phone OS` <chr>, State <chr>,
## #   Zone <chr>, `How many followers do you have on Instagram?` <dbl>,
## #   `How many posts do you have on Instagram?` <dbl>, Latitude <dbl>,
## #   Longitude <dbl>, …

# Respondents with more than 1000 Instagram followers
# (rows where the follower count is missing are dropped)
more_than_1000_followers <- data[
  which(data$`How many followers do you have on Instagram?` > 1000),
]
print(more_than_1000_followers)

## # A tibble: 199 × 40
##      Age City      `Current Status`     Do you own multiple profiles on…¹ Gender
##    <dbl> <chr>     <chr>                <chr>                             <chr> 
##  1    25 Bengaluru Student              No                                Male  
##  2    23 Kolkata   Student              Yes                               Female
##  3    21 Chennai   Student              No                                Female
##  4    21 Chennai   Student              Yes                               Female
##  5    21 Mumbai    Sabbatical           Yes                               Female
##  6    57 Delhi     Sabbatical           Yes                               Male  
##  7    22 Delhi     Student              No                                Female
##  8    35 Mumbai    Working professional Yes                               Female
##  9    26 Ahmedabad Working professional No                                Male  
## 10    21 Mumbai    Student              No                                Female
## # ℹ 189 more rows
## # ℹ abbreviated name: ¹`Do you own multiple profiles on Instagram?`
## # ℹ 35 more variables: `Highest Education` <chr>,
## #   `Location (City Airport Code)` <chr>, `Phone OS` <chr>, State <chr>,
## #   Zone <chr>, `How many followers do you have on Instagram?` <dbl>,
## #   `How many posts do you have on Instagram?` <dbl>, Latitude <dbl>,
## #   Longitude <dbl>, …

# Respondents with more than 100 Instagram posts
# (rows where the post count is missing are dropped)
more_than_100_posts <- data[
  which(data$`How many posts do you have on Instagram?` > 100),
]
print(more_than_100_posts)

## # A tibble: 444 × 40
##      Age City           `Current Status`     Do you own multiple profil…¹ Gender
##    <dbl> <chr>          <chr>                <chr>                        <chr> 
##  1    25 Vishakhapatnam Working professional Yes                          Female
##  2    25 Bengaluru      Student              No                           Male  
##  3    25 Ahmedabad      Self Employed        Yes                          Male  
##  4    50 Kolkata        Working professional No                           Female
##  5    21 Chennai        Student              No                           Female
##  6    25 Kolkata        Working professional No                           Male  
##  7    45 Bagdogra       Working professional Yes                          Female
##  8    38 Bengaluru      Working professional No                           Female
##  9    24 Kolkata        Student              Yes                          Male  
## 10    21 Chennai        Student              Yes                          Female
## # ℹ 434 more rows
## # ℹ abbreviated name: ¹`Do you own multiple profiles on Instagram?`
## # ℹ 35 more variables: `Highest Education` <chr>,
## #   `Location (City Airport Code)` <chr>, `Phone OS` <chr>, State <chr>,
## #   Zone <chr>, `How many followers do you have on Instagram?` <dbl>,
## #   `How many posts do you have on Instagram?` <dbl>, Latitude <dbl>,
## #   Longitude <dbl>, …

The scatter plot shows the relationship between the number of followers and the number of posts for Instagram users. The blue points represent users with more than 1000 followers, and the red points represent users with more than 100 posts and at most 1000 followers.

# Base-graphics scatter plot of Instagram followers vs. posts for two groups:
#   blue - users with more than 1000 followers
#   red  - users with more than 100 posts and at most 1000 followers
ig_followers <- data$`How many followers do you have on Instagram?`
ig_posts <- data$`How many posts do you have on Instagram?`

# which() keeps only rows where the comparison is TRUE (NA rows are dropped).
blue_rows <- which(ig_followers > 1000)
red_rows <- which(ig_followers <= 1000 & ig_posts > 100)
shown_rows <- c(blue_rows, red_rows)

# plot() sizes the axes from the data it is given. Limits derived from the
# blue group alone start above 1000 followers, so the red points (all at or
# below 1000 followers) would be clipped. Span both groups explicitly.
plot(
  ig_followers[blue_rows], ig_posts[blue_rows],
  col = "blue",
  xlim = range(ig_followers[shown_rows], na.rm = TRUE),
  ylim = range(ig_posts[shown_rows], na.rm = TRUE),
  xlab = "Number of followers", ylab = "Number of posts"
)
points(ig_followers[red_rows], ig_posts[red_rows], col = "red")
legend(
  "topright",
  legend = c("More than 1000 followers", "More than 100 posts"),
  col = c("blue", "red"),
  pch = 1
)

# library() stops with an error if ggplot2 is not installed; require() would
# only return FALSE and let the later ggplot() calls fail instead.
library(ggplot2)

# Gender pie chart: a single stacked bar of counts bent into polar coordinates
ggplot(data = data, mapping = aes(x = "", fill = Gender)) +
  geom_bar(width = 1) +
  labs(fill = "Gender") +
  ggtitle("Gender Distribution") +
  coord_polar(theta = "y")

## The pie chart shows the gender distribution of the users. The pie is divided into three slices, representing females, males, and non-binary individuals. The slices are colored green, blue, and light red.

# Age histogram in 5-year bins; each distinct age gets its own viridis fill
ggplot(data = data, mapping = aes(x = Age, fill = factor(Age))) +
  geom_histogram(binwidth = 5, color = "black") +
  ggtitle("Age Distribution") +
  xlab("Age") +
  ylab("Count") +
  scale_fill_viridis_d() +
  theme_minimal()

The age distribution chart shows the number of people in each age group for a given population. The x-axis shows the age group, and the y-axis shows the number of people in that age group. The bars in the chart are colored according to the age group

# Grouped bar chart of respondent counts per education level, split by gender,
# with the count printed above each bar.
# The bar outline width is set with `linewidth`; using `size` for lines is
# deprecated since ggplot2 3.4.0 and raises a warning.
ggplot(data, aes(x = `Highest Education`, fill = Gender)) +
  geom_bar(
    color = "black", linewidth = 0.5, width = 0.7,
    position = position_dodge()
  ) +
  labs(
    title = "Education Level Distribution",
    x = "Education Level",
    y = "Count"
  ) +
  theme_minimal() +
  theme(legend.position = "top", legend.title = element_blank()) +
  scale_fill_manual(values = c("#FFC0CB", "#ADD8E6", "#90EE90")) +
  guides(fill = guide_legend(reverse = TRUE)) +
  # Dodge width matches the bar width so each label sits over its own bar.
  geom_text(
    aes(label = after_stat(count)),
    stat = "count",
    position = position_dodge(width = 0.7),
    vjust = -0.5,
    size = 3
  )

## The bar chart shows the distribution of education levels for males, females, and non-binary people in the data. The x-axis shows the education level, and the y-axis shows the count of people. The bars are grouped by gender, and the colors of the bars represent the gender (pink for females, blue for males, and green for non-binary people).

# Stacked bar chart: respondent counts per phone OS, filled by gender
ggplot(data = data, mapping = aes(x = `Phone OS`, fill = Gender)) +
  geom_bar() +
  ggtitle("Phone Operating System Distribution") +
  xlab("Phone Operating System") +
  ylab("Count")

## The bar chart shows the distribution of phone operating systems by gender. The y-axis shows the count of people using each operating system, and the x-axis shows the operating system. The bars are colored by gender. The chart shows that Android is the most popular operating system for all genders.

library(ggplot2)

# Grouped bar chart of total social media usage per zone, split by gender.
# stat = "summary" with fun = "sum" adds up all respondents in each
# Zone/Gender group. With stat = "identity" and dodged bars, every row of a
# group is drawn on top of the others, so only the largest single value
# would be visible instead of the total.
ggplot(data, aes(x = `Zone`, y = `Total Social Media Usage`, fill = Gender)) +
  geom_bar(stat = "summary", fun = "sum", position = "dodge") +
  labs(
    x = "Zone/Area",
    y = "Total Social Media Usage",
    title = "Gender by Zone/Area with Total Social Media Usage"
  ) +
  scale_fill_brewer(palette = "Set3") +  # Choose a colorful palette
  theme_minimal()

## The bar graph shows the total social media usage by gender, by zone/area. The zones/areas are Central, Eastern, North-Eastern, Northern, Southern, and Western. The genders are Female, Male, and Non-Binary. The total social media usage is highest in the Western zone and lowest in the Central zone.

# Phone OS pie chart: a single stacked bar of counts bent into polar coordinates
ggplot(data = data, mapping = aes(x = "", fill = `Phone OS`)) +
  geom_bar(width = 1) +
  labs(fill = "Phone Operating System") +
  ggtitle("Phone Operating System Distribution") +
  coord_polar(theta = "y")

## The pie chart shows the distribution of phones by operating system. The largest slice is for Android, and the smallest slice is for other phone operating systems.

# Scatter plot: weekly Facebook minutes (x) against weekly Instagram minutes (y)
ggplot(
  data = data,
  mapping = aes(
    x = `Time Spent on Facebook in last week (in minutes)`,
    y = `Time Spent on Instagram in last week (in minutes)`
  )
) +
  geom_point(color = "#FFC0CB") +
  ggtitle("Time Spent on Facebook vs. Time Spent on Instagram") +
  xlab("Time Spent on Facebook (in minutes)") +
  ylab("Time Spent on Instagram (in minutes)")

## The scatter plot shows a positive correlation between time spent on Facebook and time spent on Instagram. This means that people who spend more time on Facebook also spend more time on Instagram. The scatter plot also shows that there are more people who spend more time on Facebook than on Instagram.

# Histogram of total social media usage in bins of 500, pink bars on a
# light-blue plot background with enlarged black text
ggplot(data = data, mapping = aes(x = `Total Social Media Usage`)) +
  geom_histogram(binwidth = 500, fill = "#FFC0CB", color = "black") +
  ggtitle("Social Media Usage Distribution") +
  xlab("Total Social Media Usage (in minutes)") +
  ylab("Count") +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, color = "black"),
    axis.title = element_text(size = 14, color = "black"),
    axis.text = element_text(size = 12, color = "black"),
    plot.background = element_rect(fill = "#ADD8E6")
  )

This is a histogram of the social media usage distribution. It shows the number of people who use social media for a certain amount of time each day. The x-axis of the chart shows the total social media usage in minutes, and the y-axis shows the number of people who use social media for that amount of time.

library(ggplot2)

# Stacked bar chart: usage values of all respondents stacked per profession,
# so each bar's height is the summed usage for that profession
ggplot(
  data = data,
  mapping = aes(
    x = `Profession/Activity`,
    y = `Total Social Media Usage`,
    fill = `Profession/Activity`
  )
) +
  geom_col() +
  ggtitle("Total Social Media Usage by Profession/Activity") +
  xlab("Profession/Activity") +
  ylab("Total Social Media Usage") +
  scale_fill_brewer(palette = "Set3") +  # Choose a colorful palette
  theme_minimal()

## The chart shows the total social media usage by profession/activity. The x-axis shows the different professions/activities, and the y-axis shows the total social media usage in hours per week. The bars are colored by profession/activity.

# Histogram of weekly WhatsApp minutes in bins of 100, stacked by gender
ggplot(
  data = data,
  mapping = aes(
    x = `Time Spent on WhatsApp in last week (in minutes)`,
    fill = Gender
  )
) +
  geom_histogram(binwidth = 100) +
  ggtitle("Time Spent on WhatsApp Distribution") +
  xlab("Time Spent on WhatsApp (in minutes)") +
  ylab("Count")

The graph shows that WhatsApp is a popular communication platform for people of all genders, with females spending slightly more time on it than males.

# Bar chart of the mean total social media usage for each education level
ggplot(
  data = data,
  mapping = aes(
    x = `Highest Education`,
    y = `Total Social Media Usage`,
    fill = `Highest Education`
  )
) +
  geom_bar(stat = "summary", fun = "mean") +
  ggtitle("Education Level vs. Social Media Usage") +
  xlab("Education Level") +
  ylab("Total Social Media Usage (in minutes)") +
  theme_minimal() +
  scale_fill_manual(values = c("#FFC0CB", "#ADD8E6", "#90EE90", "#FFD700"))

## The chart shows the mean total social media usage (in minutes) of people with different levels of education. The bar graph shows that people with higher education levels use social media more than those with other education levels.

# Histogram of Instagram post counts per respondent, in bins of 50 posts
ggplot(
  data = data,
  mapping = aes(x = `How many posts do you have on Instagram?`)
) +
  geom_histogram(binwidth = 50, fill = "#ADD8E6") +
  ggtitle("Number of Posts on Instagram Distribution") +
  xlab("Number of Posts on Instagram") +
  ylab("Count") +
  theme_minimal()

## The histogram shows the distribution of the number of Instagram posts per user. The x-axis shows the number of posts on Instagram, and the y-axis shows the count of users.

# Scatter plot: age (x) against Instagram post count (y), points shaded by age
ggplot(
  data = data,
  mapping = aes(
    x = Age,
    y = `How many posts do you have on Instagram?`,
    color = Age
  )
) +
  geom_point() +
  ggtitle("Age vs Number of Posts on Instagram") +
  xlab("Age") +
  ylab("Number of Posts on Instagram")

## This chart shows the relationship between age and the number of posts on Instagram. The x-axis represents the age of the user, and the y-axis represents the number of posts that the user has made on Instagram. The chart shows that, younger users post more on Instagram than older users. There are a few possible explanations for this trend. First, younger users may simply have more time to spend on social media. Second, younger users may be more likely to use social media to connect with their friends and family. Third, younger users may be more likely to use social media to express themselves and share their creativity.

# Scatter plot: age (x) against weekly Facebook minutes (y), shaded by age
ggplot(
  data = data,
  mapping = aes(
    x = Age,
    y = `Time Spent on Facebook in last week (in minutes)`,
    color = Age
  )
) +
  geom_point() +
  ggtitle("Age vs Time Spent on Facebook") +
  xlab("Age") +
  ylab("Time Spent on Facebook (in minutes)")

## The scatter plot shows the time spent on Facebook in the last week, measured in minutes, against age. Each point represents one respondent, and its height shows how much time that person spent on Facebook. A majority of users spend a relatively small amount of time on Facebook each week, and a smaller group of users spends a significant amount of time on Facebook each week.

# Scatter plot: age (x) against weekly Instagram minutes (y), shaded by age
ggplot(
  data = data,
  mapping = aes(
    x = Age,
    y = `Time Spent on Instagram in last week (in minutes)`,
    color = Age
  )
) +
  geom_point() +
  ggtitle("Age vs Time Spent on Instagram") +
  xlab("Age") +
  ylab("Time Spent on Instagram (in minutes)")

## The chart shows a negative correlation between age and time spent on Instagram, meaning that younger users spend more time on the platform than older users. One possibility is that younger users have more time to spend on social media in general.

# Histogram of total social media usage in bins of 500, stacked by gender,
# with the legend placed below the plot
ggplot(
  data = data,
  mapping = aes(x = `Total Social Media Usage`, fill = Gender)
) +
  geom_histogram(binwidth = 500) +
  # One fill colour per gender level (three values supplied)
  scale_fill_manual(values = c("#ADD8E6", "#FFC0CB", "#FF0000")) +
  ggtitle("Total Social Media Usage Distribution") +
  xlab("Total Social Media Usage") +
  ylab("Count") +
  theme_minimal() +
  theme(legend.position = "bottom")

The chart shows the distribution of total social media usage by gender. It is a histogram, with the x-axis showing total social media usage and the y-axis showing the number of people. The bars are color-coded by gender, with blue representing females, pink representing males, and red representing non-binary individuals.

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ stringr   1.5.0
## ✔ forcats   1.0.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(ggplot2)
library(maps)

## Warning: package 'maps' was built under R version 4.3.2

## 
## Attaching package: 'maps'
## 
## The following object is masked from 'package:purrr':
## 
##     map

# Re-read the survey CSV into `data` (same file as loaded at the top)
data <- read_csv(
  file = "C:/Users/bhaga/OneDrive/Desktop/new dataset/social media_new.csv"
)

## Rows: 1628 Columns: 40
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (15): City, Current Status, Do you own multiple profiles on Instagram?, ...
## dbl (11): Age, Latitude, Longitude, How many subscriber do you have on youtu...
## num (14): How many followers do you have on Instagram?, How many posts do yo...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Outline of India from the "world" map database
india_map <- map_data("world", region = "India")

# Respondent locations (longitude/latitude) coloured by state, drawn first,
# with the black, unfilled country outline layered on top
p <- ggplot(
  data,
  aes(x = Longitude, y = Latitude, color = State, fill = State)
) +
  geom_point(size = 3) +
  geom_polygon(
    data = india_map,
    aes(x = long, y = lat, group = group),
    fill = NA,
    color = "black"
  ) +
  coord_fixed(ratio = 1) +  # One unit of longitude spans one unit of latitude
  theme_minimal()

print(p)

## The chart shows the social media users in India by state; the color of each point represents the state. The states with the most social media users are Uttar Pradesh, Maharashtra, and West Bengal. The states with the fewest social media users are Jammu and Kashmir, Uttarakhand, and Himachal Pradesh.

# Age against total social media usage, one point per respondent
library(ggplot2)

# Points are shaded by age on a continuous colour scale
ggplot(data) +
  aes(x = Age, y = `Total Social Media Usage`, color = Age) +
  geom_point() +
  labs(
    title = "Age vs Total Social Media Usage",
    x = "Age",
    y = "Total Social Media Usage"
  )

## The chart shows a scatter plot of age vs total social media usage, with age on the x-axis and total social media usage on the y-axis. The data points are colored by age, with younger people represented by darker colors and older people represented by lighter colors. Overall, the chart shows that younger people use social media more than older people. One possibility is that younger people are more familiar with social media and more comfortable using it.

# Instagram followers (x) against Instagram posts (y), one point per respondent
ggplot(data) +
  aes(
    x = `How many followers do you have on Instagram?`,
    y = `How many posts do you have on Instagram?`
  ) +
  geom_point() +
  labs(
    title = "Number of Followers vs Number of Posts on Instagram",
    x = "Number of Followers on Instagram",
    y = "Number of Posts on Instagram"
  )

## The chart shows a positive correlation between the number of followers on Instagram and the number of posts on Instagram: users with more followers tend to have more posts, although some users with a high number of followers have a low number of posts.

# Pie chart of respondents per current status: a single stacked bar of counts
# wrapped into polar coordinates along the y (count) axis
ggplot(data) +
  geom_bar(aes(x = "", fill = `Current Status`), width = 1) +
  coord_polar(theta = "y") +
  ggtitle("Current Status Distribution") +
  labs(fill = "Current Status")

## The pie chart shows the distribution of people in different current status, such as Sabbatical, Self Employed, Student, and Working professional. The chart is labeled “Current Status Distribution” and the legend shows the colors used to represent each status.

#.******************************************************************************************************************************************


# Reload the survey export from disk before building the usage bar charts
data <- readr::read_csv(file = "C:/Users/bhaga/OneDrive/Desktop/new dataset/social media_new.csv")

## Rows: 1628 Columns: 40
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (15): City, Current Status, Do you own multiple profiles on Instagram?, ...
## dbl (11): Age, Latitude, Longitude, How many subscriber do you have on youtu...
## num (14): How many followers do you have on Instagram?, How many posts do yo...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Load the ggplot2 package
library(ggplot2)
library(tidyverse)
# Keep only age, gender and the usage totals needed by the bar charts below.
# (The original statement assigned the result to `data` twice in a row; a
# single assignment is sufficient.)
data <- data %>%
  select(
    Age,
    Gender,
    `Total WhatsApp Usage`,
    `Total Social Media Usage`,
    `Total Facebook Usage`,
    `Total Instagram Usage`,
    `Total YouTube Usage (minutes)`
  )



# Stacked bars of Facebook usage at each age, filled by gender
ggplot(data) +
  aes(x = Age, y = `Total Facebook Usage`, fill = Gender) +
  geom_col() +
  scale_fill_manual(values = c("Male" = "blue", "Female" = "pink")) +
  labs(
    title = "Total Facebook Usage by Age and Gender",
    x = "Age Group",
    y = "Total Usage (minutes)"
  ) +
  theme_minimal()

The bar chart shows the total Facebook usage by age and gender. The x-axis shows the age group, and the y-axis shows the total Facebook usage in minutes. The bars are colored blue for males and pink for females. The chart shows that younger users use Facebook more; the highest total usage is in the 20-29 age group.

# Stacked bars of Instagram usage at each age, filled by gender
ggplot(data) +
  aes(x = Age, y = `Total Instagram Usage`, fill = Gender) +
  geom_col() +
  scale_fill_manual(values = c("Male" = "blue", "Female" = "red")) +
  labs(
    title = "Total Instagram Usage by Age and Gender",
    x = "Age Group",
    y = "Total Usage (minutes)"
  ) +
  theme_minimal()

## The bar chart shows the total Instagram usage by age and gender. The chart is labeled “Total Instagram Usage by Age and Gender” and the x-axis shows the age group, while the y-axis shows the total instagram usage in minutes. The chart is color-coded by gender, with male users in blue and female users in red.

# Stacked bars of WhatsApp usage at each age, filled by gender
ggplot(data) +
  aes(x = Age, y = `Total WhatsApp Usage`, fill = Gender) +
  geom_col() +
  scale_fill_manual(values = c("Male" = "maroon", "Female" = "pink")) +
  labs(
    title = "Total WhatsApp Usage by Age and Gender",
    x = "Age Group",
    y = "Total Usage (minutes)"
  ) +
  theme_minimal()

## The bar chart shows the total WhatsApp usage by age and gender. The x-axis shows the age group, and the y-axis shows the total usage in minutes. The bars are colored with gender, maroon for males and pink for females. The chart shows that overall, female use WhatsApp more than male

# Stacked bars of YouTube usage at each age, filled by gender
ggplot(data) +
  aes(x = Age, y = `Total YouTube Usage (minutes)`, fill = Gender) +
  geom_col() +
  scale_fill_manual(values = c("Male" = "maroon", "Female" = "skyblue")) +
  labs(
    title = "Total YouTube Usage by Age and Gender",
    x = "Age Group",
    y = "Total Usage (minutes)"
  ) +
  theme_minimal()

## The bar chart shows the total YouTube usage by age and gender. The chart is labeled “Total YouTube Usage by Age and Gender”; the x-axis shows the age group, while the y-axis shows the total usage in minutes. The chart is colored by gender, with male users in maroon and female users in sky-blue. The chart shows that female users use YouTube more than male users in all age groups.

# Reshape to one row per respondent and platform: the four usage columns are
# stacked into `Total Usage`, with the source column name kept in `Platform`
data_long <- pivot_longer(
  data,
  cols = all_of(c(
    "Total Facebook Usage",
    "Total Instagram Usage",
    "Total WhatsApp Usage",
    "Total YouTube Usage (minutes)"
  )),
  names_to = "Platform",
  values_to = "Total Usage"
)

# One panel per platform, each with its own y-axis range
ggplot(data_long) +
  aes(x = Age, y = `Total Usage`, fill = Gender) +
  geom_col() +
  facet_wrap(~ Platform, scales = "free_y") +
  scale_fill_manual(values = c("Male" = "blue", "Female" = "red")) +
  labs(
    title = "Social Media Usage by Age and Gender",
    x = "Age Group",
    y = "Total Usage (minutes)"
  ) +
  theme_minimal()

This combined bar chart shows the total usage of four social media platforms (Facebook, Instagram, WhatsApp, and YouTube) by age and gender, in minutes.

library(readr)
# Reload the survey export so clustering starts from the full table
data <- readr::read_csv(file = "C:/Users/bhaga/OneDrive/Desktop/new dataset/social media_new.csv")

## Rows: 1628 Columns: 40
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (15): City, Current Status, Do you own multiple profiles on Instagram?, ...
## dbl (11): Age, Latitude, Longitude, How many subscriber do you have on youtu...
## num (14): How many followers do you have on Instagram?, How many posts do yo...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Load required library
library(ggplot2)
library(factoextra)

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

# Restrict the table to the two clustering features
data <- data[c("Age", "Income from YouTube (rs,month)")]

# Fit k-means with two centres; the seed fixes the random initial centres
# NOTE(review): the features are clustered on their raw scales, so the income
# column presumably dominates the distance -- confirm this is intended
set.seed(123)
k <- 2
kmeans_result <- kmeans(data, centers = k)

# Store each row's cluster label as a factor column
data$Cluster <- factor(kmeans_result$cluster)

# Show the table together with its cluster labels
print(data)

## # A tibble: 1,628 × 3
##      Age `Income from YouTube (rs,month)` Cluster
##    <dbl>                            <dbl> <fct>  
##  1    24                            88447 2      
##  2    39                            64764 2      
##  3    22                             4387 1      
##  4    26                            99695 2      
##  5    50                            81297 2      
##  6    25                            51770 2      
##  7    52                            38003 1      
##  8    45                            54216 2      
##  9    25                            48149 1      
## 10    27                            12657 1      
## # ℹ 1,618 more rows

# Elbow method: total within-cluster sum of squares (WCSS) for k = 1..10,
# fitted in increasing order of k on the two clustering features
wcss <- vapply(
  seq_len(10),
  function(n_clusters) {
    fit <- kmeans(data[, c("Age", "Income from YouTube (rs,month)")], centers = n_clusters)
    fit$tot.withinss
  },
  numeric(1)
)

# WCSS against k, with a dashed red line at the k used for the fit above
plot(
  seq_along(wcss), wcss,
  type = "b",
  xlab = "K (Number of Clusters)",
  ylab = "WCSS (Within-Cluster Sum of Squares)"
)
abline(v = k, col = "red", lty = 2)

# Scatter of the clustered rows (colour = cluster) with each fitted centre
# overlaid as a dark red cross
ggplot(data) +
  aes(x = Age, y = `Income from YouTube (rs,month)`, color = Cluster) +
  geom_point() +
  geom_point(
    data = as.data.frame(kmeans_result$centers),
    aes(x = Age, y = `Income from YouTube (rs,month)`),
    color = "darkred",
    size = 3,
    shape = 4
  ) +
  scale_color_manual(values = c("blue", "green")) +
  ggtitle("K-Means Clustering (k = 2) with Centroids") +
  xlab("Age") +
  ylab("Income from YouTube")

## The chart shows the results of the Elbow Method for determining the optimal number of clusters for k-means clustering. The Elbow Method plots the Within-Cluster Sum of Squares (WCSS) for different numbers of clusters (k). The WCSS is a measure of how well the data points fit into their assigned clusters. A lower WCSS indicates that the data points are more tightly clustered around their centroids.

The chart is a k-means clustering of data points based on their age and income from YouTube. K-means clustering is an unsupervised learning algorithm. The chart shows that the data points are clustered into two groups, with the blue cluster representing lower incomes and the green cluster representing higher incomes. The centroids of the two clusters are marked by the dark red X points.

library(readr)
library(dplyr)
library(ggplot2)

# Reload the survey export before fitting the Facebook model
data <- readr::read_csv(file = "C:/Users/bhaga/OneDrive/Desktop/new dataset/social media_new.csv")

## Rows: 1628 Columns: 40
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (15): City, Current Status, Do you own multiple profiles on Instagram?, ...
## dbl (11): Age, Latitude, Longitude, How many subscriber do you have on youtu...
## num (14): How many followers do you have on Instagram?, How many posts do yo...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Keep the two modelling columns and discard rows with missing values
data <- data %>%
  select(Age, `Total Facebook Usage`) %>%
  na.omit()

# Ensure Age is stored as a numeric vector
data$Age <- as.numeric(data$Age)

# Simple linear regression of Facebook usage on age
model <- lm(`Total Facebook Usage` ~ Age, data = data)

# Fitted usage at ages 31 to 35 (predictions per age, not per calendar year)
new_data <- data.frame(Age = seq(31, 35, by = 1))
prediction <- predict(model, newdata = new_data)

# Label respondents above 35 as "Older" and everyone else as "Younger"
data <- data %>%
  mutate(Group = ifelse(Age > 35, "Older", "Younger"))

# Scatter by group, one overall regression line, the predicted points in red,
# and a dashed marker at the age-35 threshold
ggplot(data) +
  aes(x = Age, y = `Total Facebook Usage`, color = Group) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  geom_point(
    data = cbind(new_data, prediction = prediction),
    aes(x = Age, y = prediction),
    color = "red",
    size = 3
  ) +
  labs(title = "Age vs. Total Facebook Usage", x = "Age", y = "Total Facebook Usage") +
  scale_color_manual(values = c("Younger" = "green", "Older" = "purple")) +
  geom_vline(xintercept = 35, linetype = "dashed")

## `geom_smooth()` using formula = 'y ~ x'

## Here I use linear regression to show the relationship between age and total Facebook usage. The data is divided into two groups: younger users (age 35 or below), shown in green, and older users (age 36 or above), shown in purple. The vertical dashed line at age 35 separates the two groups. The blue line represents the linear regression model, which predicts total Facebook usage based on age. The chart shows that there is a positive correlation between age and total Facebook usage. This means that as people get older, they use Facebook more. This is likely because older people have more time to spend on social media and are more likely to have friends and family members who use Facebook. The red data points mark the values predicted by the linear regression model for ages 31, 32, 33, 34, and 35.

library(readr)
library(dplyr)
library(ggplot2)

# Reload the survey export before fitting the Instagram model
data <- readr::read_csv(file = "C:/Users/bhaga/OneDrive/Desktop/new dataset/social media_new.csv")

## Rows: 1628 Columns: 40
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (15): City, Current Status, Do you own multiple profiles on Instagram?, ...
## dbl (11): Age, Latitude, Longitude, How many subscriber do you have on youtu...
## num (14): How many followers do you have on Instagram?, How many posts do yo...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Keep the two modelling columns and discard rows with missing values
data <- data %>%
  select(Age, `Total Instagram Usage`) %>%
  na.omit()

# Ensure Age is stored as a numeric vector
data$Age <- as.numeric(data$Age)

# Simple linear regression of Instagram usage on age
model <- lm(`Total Instagram Usage` ~ Age, data = data)

# Fitted usage at ages 31 to 35 (predictions per age, not per calendar year)
new_data <- data.frame(Age = seq(31, 35, by = 1))
prediction <- predict(model, newdata = new_data)

# Label respondents above 35 as "Older" and everyone else as "Younger"
data <- data %>%
  mutate(Group = ifelse(Age > 35, "Older", "Younger"))

# Scatter by group, one overall regression line, the predicted points in red,
# and a dashed marker at the age-35 threshold
ggplot(data) +
  aes(x = Age, y = `Total Instagram Usage`, color = Group) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  geom_point(
    data = cbind(new_data, prediction = prediction),
    aes(x = Age, y = prediction),
    color = "red",
    size = 3
  ) +
  labs(title = "Age vs. Total Instagram Usage", x = "Age", y = "Total Instagram Usage") +
  scale_color_manual(values = c("Younger" = "maroon", "Older" = "blue")) +
  geom_vline(xintercept = 35, linetype = "dashed")

## `geom_smooth()` using formula = 'y ~ x'

## Here I use linear regression to show the relationship between age and total Instagram usage. The dataset is segmented into two distinct groups: younger users (aged 35 or below), represented in maroon, and older users (aged 36 or above), shown in blue. A vertical dashed line at age 35 separates these two groups. The blue line in the chart represents the linear regression model, which predicts total Instagram usage based on age; it shows that younger users use Instagram more frequently. The red data points mark the values predicted by the linear regression model for ages 31, 32, 33, 34, and 35.

library(readr)
library(dplyr)
library(ggplot2)

# Reload the survey export before fitting the WhatsApp model
data <- readr::read_csv(file = "C:/Users/bhaga/OneDrive/Desktop/new dataset/social media_new.csv")

## Rows: 1628 Columns: 40
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (15): City, Current Status, Do you own multiple profiles on Instagram?, ...
## dbl (11): Age, Latitude, Longitude, How many subscriber do you have on youtu...
## num (14): How many followers do you have on Instagram?, How many posts do yo...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Keep the two modelling columns and discard rows with missing values
data <- data %>%
  select(Age, `Total WhatsApp Usage`) %>%
  na.omit()

# Ensure Age is stored as a numeric vector
data$Age <- as.numeric(data$Age)

# Simple linear regression of WhatsApp usage on age
model <- lm(`Total WhatsApp Usage` ~ Age, data = data)

# Fitted usage at ages 31 to 35 (predictions per age, not per calendar year)
new_data <- data.frame(Age = seq(31, 35, by = 1))
prediction <- predict(model, newdata = new_data)

# Label respondents above 35 as "Older" and everyone else as "Younger"
data <- data %>%
  mutate(Group = ifelse(Age > 35, "Older", "Younger"))

# Scatter by group, one overall regression line, the predicted points in
# green, and a dashed marker at the age-35 threshold
ggplot(data) +
  aes(x = Age, y = `Total WhatsApp Usage`, color = Group) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  geom_point(
    data = cbind(new_data, prediction = prediction),
    aes(x = Age, y = prediction),
    color = "green",
    size = 3
  ) +
  labs(title = "Age vs. Total WhatsApp Usage", x = "Age", y = "Total WhatsApp Usage") +
  scale_color_manual(values = c("Younger" = "red", "Older" = "blue")) +
  geom_vline(xintercept = 35, linetype = "dashed")

## `geom_smooth()` using formula = 'y ~ x'

Here I use a linear regression model to explore the relationship between age and total WhatsApp usage. The dataset is focused on two key variables: ‘Age’ and ‘Total WhatsApp Usage.’ Rows with missing values are removed. A linear regression model is then created, which predicts ‘Total WhatsApp Usage’ based on an individual’s age. To estimate expected usage over the coming five years, predictions are made for ages 31 through 35. A vertical dashed line is introduced at the age of 35. Data points are colored differently for the two age groups, with ‘Younger’ users represented in red and ‘Older’ users in blue. The blue line represents the linear regression model, showing how age influences WhatsApp usage. The green data points mark the predicted WhatsApp usage for ages 31, 32, 33, 34, and 35, offering insight into future trends in WhatsApp usage based on age.

library(readr)
library(dplyr)
library(ggplot2)

# Reload the survey export before fitting the total-usage model
data <- readr::read_csv(file = "C:/Users/bhaga/OneDrive/Desktop/new dataset/social media_new.csv")

## Rows: 1628 Columns: 40
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (15): City, Current Status, Do you own multiple profiles on Instagram?, ...
## dbl (11): Age, Latitude, Longitude, How many subscriber do you have on youtu...
## num (14): How many followers do you have on Instagram?, How many posts do yo...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Keep the two modelling columns and discard rows with missing values
data <- data %>%
  select(Age, `Total Social Media Usage`) %>%
  na.omit()

# Ensure Age is stored as a numeric vector
data$Age <- as.numeric(data$Age)

# Simple linear regression of total social media usage on age
model <- lm(`Total Social Media Usage` ~ Age, data = data)

# Fitted usage at ages 31 to 35 (predictions per age, not per calendar year)
new_data <- data.frame(Age = seq(31, 35, by = 1))
prediction <- predict(model, newdata = new_data)

# Label respondents above 35 as "Older" and everyone else as "Younger"
data <- data %>%
  mutate(Group = ifelse(Age > 35, "Older", "Younger"))

# Scatter by group, one overall regression line, the predicted points in
# yellow, and a dashed marker at the age-35 threshold
ggplot(data) +
  aes(x = Age, y = `Total Social Media Usage`, color = Group) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  geom_point(
    data = cbind(new_data, prediction = prediction),
    aes(x = Age, y = prediction),
    color = "yellow",
    size = 3
  ) +
  labs(title = "Age vs. Total Social Media Usage", x = "Age", y = "Total Social Media Usage") +
  scale_color_manual(values = c("Younger" = "red", "Older" = "blue")) +
  geom_vline(xintercept = 35, linetype = "dashed")

## `geom_smooth()` using formula = 'y ~ x'

## Here I use a linear regression model to explore the relationship between age and total social media usage, after selecting the ‘Age’ and ‘Total Social Media Usage’ variables. The core of the analysis is a linear regression model that models ‘Total Social Media Usage’ based on age. I use this model to predict the expected usage for the next five years, with ages 31, 32, 33, 34, and 35 included in the new data. Data points are colored by group, with ‘Younger’ users in red and ‘Older’ users in blue. The blue line represents the linear regression model, and the yellow data points mark the predicted social media usage for ages 31, 32, 33, 34, and 35, indicating expected future trends based on age.

library(readr)
library(dplyr)
library(ggplot2)

# Reload the survey export before the combined per-platform analysis
data <- readr::read_csv(file = "C:/Users/bhaga/OneDrive/Desktop/new dataset/social media_new.csv")

## Rows: 1628 Columns: 40
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (15): City, Current Status, Do you own multiple profiles on Instagram?, ...
## dbl (11): Age, Latitude, Longitude, How many subscriber do you have on youtu...
## num (14): How many followers do you have on Instagram?, How many posts do yo...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Keep age plus the four usage totals that are modelled below
data <- data %>%
  select(
    Age,
    `Total Facebook Usage`,
    `Total Social Media Usage`,
    `Total Instagram Usage`,
    `Total WhatsApp Usage`
  )

# Remove any rows with missing values
data <- na.omit(data)

# Ensure Age is stored as a numeric vector
data$Age <- as.numeric(data$Age)

# Ages at which each fitted model is evaluated. This is the same for every
# platform, so it is built once instead of on every loop iteration.
new_data <- data.frame(Age = c(31, 32, 33, 34, 35))

# Fit one simple linear regression per platform (usage ~ Age) and print the
# fitted usage at the ages above
platforms <- c("Facebook", "Social Media", "Instagram", "WhatsApp")
for (platform in platforms) {
  # Build a real formula object rather than handing lm() a bare string; the
  # backticks are required because the column names contain spaces
  usage_formula <- as.formula(paste0("`Total ", platform, " Usage` ~ Age"))
  model <- lm(usage_formula, data = data)
  prediction <- predict(model, newdata = new_data)
  cat("Predicted values for Total", platform, "Usage:", prediction, "\n")
}

## Predicted values for Total Facebook Usage: 284.0396 292.0368 300.0339 308.031 316.0281 
## Predicted values for Total Social Media Usage: 2002.076 1973.391 1944.706 1916.022 1887.337 
## Predicted values for Total Instagram Usage: 615.9642 590.8018 565.6394 540.477 515.3145 
## Predicted values for Total WhatsApp Usage: 1102.072 1090.553 1079.033 1067.514 1055.994

# Reshape to one row per respondent and platform (every column except Age is
# stacked into `Usage`), then tag each row as "Older" (Age > 35) or "Younger"
data_long <- data %>%
  pivot_longer(cols = !Age, names_to = "Platform", values_to = "Usage") %>%
  mutate(Group = ifelse(Age > 35, "Older", "Younger"))

# One panel per usage column: scatter by group, an overall regression line,
# and a dashed marker at the age-35 threshold; y-axes are scaled per panel
ggplot(data_long) +
  aes(x = Age, y = Usage, color = Group) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  geom_vline(xintercept = 35, linetype = "dashed") +
  facet_wrap(~Platform, scales = "free_y") +
  scale_color_manual(values = c("Younger" = "red", "Older" = "blue")) +
  labs(title = "Age vs. Social Media Usage by Platform", x = "Age", y = "Usage")

## `geom_smooth()` using formula = 'y ~ x'

## This combined chart provides insight into how age influences social media usage across multiple platforms, including Facebook, total social media, Instagram, and WhatsApp. It allows us to compare how age impacts usage across the platforms and to understand expected usage trends as people age.

# Closing slide: a single centred text label on an empty canvas.
# The text size is set as a constant via annotate(); placing `size = 10`
# inside aes() would map it to a size scale, which adds a legend and does not
# draw the label at size 10.
ggplot() +
  annotate("text", x = 0.5, y = 0.5, label = "Thank You Mam", size = 10) +
  theme_void()