Reddit Data Experiments

Notes:

# Read in the Reddit data.
# stringsAsFactors = TRUE is passed explicitly because the read.csv() default
# changed to FALSE in R 4.0.0. Without it the text columns load as character
# vectors, so the levels() and is.factor() checks later in this script would
# return NULL and FALSE instead of the factor results recorded here.
reddit <- read.csv("reddit.csv", stringsAsFactors = TRUE)

# Tabulate how many rows fall under each employment status
table(reddit[["employment.status"]])
## 
##                    Employed full time 
##                                 14814 
##                             Freelance 
##                                  1948 
## Not employed and not looking for work 
##                                   682 
##    Not employed, but looking for work 
##                                  2087 
##                               Retired 
##                                    85 
##                               Student 
##                                 12987
# Per-column summary of the whole data frame
summary(object = reddit)
##        id            gender          age.range    
##  Min.   :    1   Min.   :0.0000   18-24   :15802  
##  1st Qu.: 8189   1st Qu.:0.0000   25-34   :11575  
##  Median :16380   Median :0.0000   Under 18: 2330  
##  Mean   :16379   Mean   :0.1885   35-44   : 2257  
##  3rd Qu.:24568   3rd Qu.:0.0000   45-54   :  502  
##  Max.   :32756   Max.   :1.0000   (Other) :  200  
##                  NA's   :201      NA's    :   88  
##                                   marital.status 
##  Engaged                                 : 1109  
##  Forever Alone                           : 5850  
##  In a relationship                       : 9828  
##  Married/civil union/domestic partnership: 5490  
##  Single                                  :10428  
##  Widowed                                 :   44  
##  NA's                                    :    5  
##                              employment.status military.service
##  Employed full time                   :14814   No  :30526      
##  Freelance                            : 1948   Yes : 2223      
##  Not employed and not looking for work:  682   NA's:    5      
##  Not employed, but looking for work   : 2087                   
##  Retired                              :   85                   
##  Student                              :12987                   
##  NA's                                 :  151                   
##  children                                  education    
##  No  :27488   Bachelor's degree                 :11046  
##  Yes : 5047   Some college                      : 9600  
##  NA's:  219   Graduate or professional degree   : 4722  
##               High school graduate or equivalent: 3272  
##               Some high school                  : 1924  
##               (Other)                           : 2046  
##               NA's                              :  144  
##            country             state                    income.range 
##  United States :20967             :11908   Under $20,000      :7892  
##  Canada        : 2888   California: 3401   $50,000 - $69,999  :4133  
##  United Kingdom: 1782   Texas     : 1541   $70,000 - $99,999  :4101  
##  Australia     : 1051   New York  : 1418   $100,000 - $149,999:3522  
##  Germany       :  407   Illinois  :  976   $20,000 - $29,999  :3206  
##  (Other)       : 5482   Washington:  910   (Other)            :8285  
##  NA's          :  177   (Other)   :12600   NA's               :1615  
##                fav.reddit               dog.cat            cheese    
##                     : 4335   I like cats.   :11156   Other    :6563  
##  askreddit          : 2123   I like dogs.   :17151   Cheddar  :6102  
##  fffffffuuuuuuuuuuuu: 1746   I like turtles.: 4442   Brie     :3742  
##  pics               : 1651   NA's           :    5   Provolone:3456  
##  trees              : 1311                           Swiss    :3214  
##  (Other)            :21562                           (Other)  :9672  
##  NA's               :   26                           NA's     :   5
# Print Age Range Data
# reddit$age.range

# List the factor levels of the age and income columns
levels(reddit[["age.range"]])
## [1] "18-24"       "25-34"       "35-44"       "45-54"       "55-64"      
## [6] "65 or Above" "Under 18"
levels(reddit[["income.range"]])
## [1] "$100,000 - $149,999" "$150,000 or more"    "$20,000 - $29,999"  
## [4] "$30,000 - $39,999"   "$40,000 - $49,999"   "$50,000 - $69,999"  
## [7] "$70,000 - $99,999"   "Under $20,000"
# Spot-check whether the age range column is a factor or a numeric vector
is.factor(reddit[["age.range"]])
## [1] TRUE
is.numeric(reddit[["age.range"]])
## [1] FALSE
# Display the compact structure of the data frame (one line per column)
str(object = reddit)
## 'data.frame':    32754 obs. of  14 variables:
##  $ id               : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ gender           : int  0 0 1 0 1 0 0 0 0 0 ...
##  $ age.range        : Factor w/ 7 levels "18-24","25-34",..: 2 2 1 2 2 2 2 1 3 2 ...
##  $ marital.status   : Factor w/ 6 levels "Engaged","Forever Alone",..: NA NA NA NA NA 4 3 4 4 3 ...
##  $ employment.status: Factor w/ 6 levels "Employed full time",..: 1 1 2 2 1 1 1 4 1 2 ...
##  $ military.service : Factor w/ 2 levels "No","Yes": NA NA NA NA NA 1 1 1 1 1 ...
##  $ children         : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ education        : Factor w/ 7 levels "Associate degree",..: 2 2 5 2 2 2 5 2 2 5 ...
##  $ country          : Factor w/ 439 levels " Canada"," Canada eh",..: 394 394 394 394 394 394 125 394 394 125 ...
##  $ state            : Factor w/ 53 levels "","Alabama","Alaska",..: 33 33 48 33 6 33 1 6 33 1 ...
##  $ income.range     : Factor w/ 8 levels "$100,000 - $149,999",..: 2 2 8 2 7 2 NA 7 2 7 ...
##  $ fav.reddit       : Factor w/ 1834 levels "","'home' page (or front page if you prefer)",..: 720 691 1511 1528 188 691 1318 571 1629 1 ...
##  $ dog.cat          : Factor w/ 3 levels "I like cats.",..: NA NA NA NA NA 2 2 2 1 1 ...
##  $ cheese           : Factor w/ 11 levels "American","Brie",..: NA NA NA NA NA 3 3 1 10 7 ...
# Re-arrange the age range levels into ascending age order and mark the
# factor as ordered, so tables and plots follow the natural progression.
# ordered(x, levels = ...) is shorthand for factor(x, levels = ...,
# ordered = TRUE); the same form is used for both columns, which also avoids
# the reassignable `T` alias for TRUE.
reddit$age.range <- ordered(
  reddit$age.range,
  levels = c(
    "Under 18", "18-24", "25-34", "35-44", "45-54", "55-64", "65 or Above"
  )
)

# Re-arrange the income range levels from the lowest to the highest bracket.
reddit$income.range <- ordered(
  reddit$income.range,
  levels = c(
    "Under $20,000", "$20,000 - $29,999", "$30,000 - $39,999",
    "$40,000 - $49,999", "$50,000 - $69,999", "$70,000 - $99,999",
    "$100,000 - $149,999", "$150,000 or more"
  )
)


# Load ggplot2 and draw a bar chart of row counts per age range and per
# income range.
# qplot() is deprecated as of ggplot2 3.4.0; ggplot() + geom_bar() is the
# explicit equivalent (qplot chooses a bar geom for a discrete x with no y).
library(ggplot2)
ggplot(data = reddit, mapping = aes(x = age.range)) +
  geom_bar()

ggplot(data = reddit, mapping = aes(x = income.range)) +
  geom_bar()