# Notes:
# Read in the data from reddit.csv in the working directory.
# stringsAsFactors is set explicitly because R >= 4.0 reads text columns as
# character by default; the levels()/is.factor()/str() checks further down
# (and their recorded output) rely on these columns being factors.
reddit <- read.csv("reddit.csv", stringsAsFactors = TRUE)
# Describe the employment status: count of respondents per category
table(reddit$employment.status)
##
## Employed full time
## 14814
## Freelance
## 1948
## Not employed and not looking for work
## 682
## Not employed, but looking for work
## 2087
## Retired
## 85
## Student
## 12987
# Summary: per-column overview of the whole data frame (quartiles for the
# numeric columns, level counts for the factor columns, and NA counts)
summary(reddit)
## id gender age.range
## Min. : 1 Min. :0.0000 18-24 :15802
## 1st Qu.: 8189 1st Qu.:0.0000 25-34 :11575
## Median :16380 Median :0.0000 Under 18: 2330
## Mean :16379 Mean :0.1885 35-44 : 2257
## 3rd Qu.:24568 3rd Qu.:0.0000 45-54 : 502
## Max. :32756 Max. :1.0000 (Other) : 200
## NA's :201 NA's : 88
## marital.status
## Engaged : 1109
## Forever Alone : 5850
## In a relationship : 9828
## Married/civil union/domestic partnership: 5490
## Single :10428
## Widowed : 44
## NA's : 5
## employment.status military.service
## Employed full time :14814 No :30526
## Freelance : 1948 Yes : 2223
## Not employed and not looking for work: 682 NA's: 5
## Not employed, but looking for work : 2087
## Retired : 85
## Student :12987
## NA's : 151
## children education
## No :27488 Bachelor's degree :11046
## Yes : 5047 Some college : 9600
## NA's: 219 Graduate or professional degree : 4722
## High school graduate or equivalent: 3272
## Some high school : 1924
## (Other) : 2046
## NA's : 144
## country state income.range
## United States :20967 :11908 Under $20,000 :7892
## Canada : 2888 California: 3401 $50,000 - $69,999 :4133
## United Kingdom: 1782 Texas : 1541 $70,000 - $99,999 :4101
## Australia : 1051 New York : 1418 $100,000 - $149,999:3522
## Germany : 407 Illinois : 976 $20,000 - $29,999 :3206
## (Other) : 5482 Washington: 910 (Other) :8285
## NA's : 177 (Other) :12600 NA's :1615
## fav.reddit dog.cat cheese
## : 4335 I like cats. :11156 Other :6563
## askreddit : 2123 I like dogs. :17151 Cheddar :6102
## fffffffuuuuuuuuuuuu: 1746 I like turtles.: 4442 Brie :3742
## pics : 1651 NA's : 5 Provolone:3456
## trees : 1311 Swiss :3214
## (Other) :21562 (Other) :9672
## NA's : 26 NA's : 5
# Dump the raw age range column (left disabled)
# reddit[["age.range"]]
# List the factor levels of the age and income columns
levels(reddit[["age.range"]])
## [1] "18-24" "25-34" "35-44" "45-54" "55-64"
## [6] "65 or Above" "Under 18"
levels(reddit[["income.range"]])
## [1] "$100,000 - $149,999" "$150,000 or more" "$20,000 - $29,999"
## [4] "$30,000 - $39,999" "$40,000 - $49,999" "$50,000 - $69,999"
## [7] "$70,000 - $99,999" "Under $20,000"
# Spot-check whether the age range column is a factor or numeric
is.factor(reddit[["age.range"]])
## [1] TRUE
is.numeric(reddit[["age.range"]])
## [1] FALSE
# Show the structure of the data set: its dimensions plus each column's
# type and first few values
str(reddit)
## 'data.frame': 32754 obs. of 14 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ gender : int 0 0 1 0 1 0 0 0 0 0 ...
## $ age.range : Factor w/ 7 levels "18-24","25-34",..: 2 2 1 2 2 2 2 1 3 2 ...
## $ marital.status : Factor w/ 6 levels "Engaged","Forever Alone",..: NA NA NA NA NA 4 3 4 4 3 ...
## $ employment.status: Factor w/ 6 levels "Employed full time",..: 1 1 2 2 1 1 1 4 1 2 ...
## $ military.service : Factor w/ 2 levels "No","Yes": NA NA NA NA NA 1 1 1 1 1 ...
## $ children : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ education : Factor w/ 7 levels "Associate degree",..: 2 2 5 2 2 2 5 2 2 5 ...
## $ country : Factor w/ 439 levels " Canada"," Canada eh",..: 394 394 394 394 394 394 125 394 394 125 ...
## $ state : Factor w/ 53 levels "","Alabama","Alaska",..: 33 33 48 33 6 33 1 6 33 1 ...
## $ income.range : Factor w/ 8 levels "$100,000 - $149,999",..: 2 2 8 2 7 2 NA 7 2 7 ...
## $ fav.reddit : Factor w/ 1834 levels "","'home' page (or front page if you prefer)",..: 720 691 1511 1528 188 691 1318 571 1629 1 ...
## $ dog.cat : Factor w/ 3 levels "I like cats.",..: NA NA NA NA NA 2 2 2 1 1 ...
## $ cheese : Factor w/ 11 levels "American","Brie",..: NA NA NA NA NA 3 3 1 10 7 ...
# Re-arrange age range
# The default level order is alphabetical, which puts "Under 18" last; list
# the levels explicitly so they run from youngest to oldest.
reddit$age.range <- factor(
  reddit$age.range,
  levels = c(
    "Under 18", "18-24", "25-34", "35-44", "45-54", "55-64", "65 or Above"
  ),
  ordered = TRUE
)
# Re-arrange income range
# Same idea: order the brackets from lowest to highest income.
reddit$income.range <- factor(
  reddit$income.range,
  levels = c(
    "Under $20,000", "$20,000 - $29,999", "$30,000 - $39,999",
    "$40,000 - $49,999", "$50,000 - $69,999", "$70,000 - $99,999",
    "$100,000 - $149,999", "$150,000 or more"
  ),
  ordered = TRUE
)
# Load ggplot2 and draw bar charts of the respondent counts per age range
# and per income range. ggplot() + geom_bar() is used instead of qplot(),
# which is deprecated as of ggplot2 3.4.0; for a single discrete x variable
# qplot() draws the same bar chart.
library(ggplot2)
ggplot(data = reddit, mapping = aes(x = age.range)) +
  geom_bar()
ggplot(data = reddit, mapping = aes(x = income.range)) +
  geom_bar()