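Welcome to week 2! This week we learned how to import and export data and how to pull content from the web. In this R Markdown document, I will show you how to import .csv and .xlsx files, both from a local download and directly from a URL, and how to find a link to a .txt file on a web page and import that file.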
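We will be using three different datasets and previewing them here. The datasets are the reddit data (reddit.csv), the FY2017 FMR data from HUD (FY2017_4050_FMR.xlsx), and the Cincinnati weather data (OHCINCIN.txt).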
library(readxl) #used to read .xlsx files from your device
library(gdata) #used to pull in .xlsx files addressed as a URL
library(XML) #used to get the HTML links from a website
library(stringr) #used to find a string with a keyword
For each homework problem below, the intent is stated first, followed by the code and its printed result. After each import, head() and str() are used to show the first few rows and the structure of the data frame, respectively.
# Import the reddit data from a CSV file already downloaded to the working
# directory; text columns are kept as character rather than factor.
reddit_df <- read.csv(
  file = "reddit.csv",
  stringsAsFactors = FALSE
)
# Preview the first six rows.
head(x = reddit_df, n = 6L)
## id gender age.range marital.status
## 1 1 0 25-34 <NA>
## 2 2 0 25-34 <NA>
## 3 3 1 18-24 <NA>
## 4 4 0 25-34 <NA>
## 5 5 1 25-34 <NA>
## 6 6 0 25-34 Married/civil union/domestic partnership
## employment.status military.service children education
## 1 Employed full time <NA> No Bachelor's degree
## 2 Employed full time <NA> No Bachelor's degree
## 3 Freelance <NA> No Some college
## 4 Freelance <NA> No Bachelor's degree
## 5 Employed full time <NA> No Bachelor's degree
## 6 Employed full time No No Bachelor's degree
## country state income.range fav.reddit dog.cat
## 1 United States New York $150,000 or more getmotivated <NA>
## 2 United States New York $150,000 or more gaming <NA>
## 3 United States Virginia Under $20,000 snackexchange <NA>
## 4 United States New York $150,000 or more spacedicks <NA>
## 5 United States California $70,000 - $99,999 aww <NA>
## 6 United States New York $150,000 or more gaming I like dogs.
## cheese
## 1 <NA>
## 2 <NA>
## 3 <NA>
## 4 <NA>
## 5 <NA>
## 6 Cheddar
# Compact summary of the imported data frame: its dimensions and each column's type.
str(reddit_df)
## 'data.frame': 32754 obs. of 14 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ gender : int 0 0 1 0 1 0 0 0 0 0 ...
## $ age.range : chr "25-34" "25-34" "18-24" "25-34" ...
## $ marital.status : chr NA NA NA NA ...
## $ employment.status: chr "Employed full time" "Employed full time" "Freelance" "Freelance" ...
## $ military.service : chr NA NA NA NA ...
## $ children : chr "No" "No" "No" "No" ...
## $ education : chr "Bachelor's degree" "Bachelor's degree" "Some college" "Bachelor's degree" ...
## $ country : chr "United States" "United States" "United States" "United States" ...
## $ state : chr "New York" "New York" "Virginia" "New York" ...
## $ income.range : chr "$150,000 or more" "$150,000 or more" "Under $20,000" "$150,000 or more" ...
## $ fav.reddit : chr "getmotivated" "gaming" "snackexchange" "spacedicks" ...
## $ dog.cat : chr NA NA NA NA ...
## $ cheese : chr NA NA NA NA ...
# Import the same reddit CSV directly from its web address instead of from disk.
url <- "https://bradleyboehmke.github.io/public/data/reddit.csv"
reddit_df1 <- read.csv(
  file = url,
  stringsAsFactors = FALSE
)
# Preview the first six rows.
head(x = reddit_df1, n = 6L)
## id gender age.range marital.status
## 1 1 0 25-34 <NA>
## 2 2 0 25-34 <NA>
## 3 3 1 18-24 <NA>
## 4 4 0 25-34 <NA>
## 5 5 1 25-34 <NA>
## 6 6 0 25-34 Married/civil union/domestic partnership
## employment.status military.service children education
## 1 Employed full time <NA> No Bachelor's degree
## 2 Employed full time <NA> No Bachelor's degree
## 3 Freelance <NA> No Some college
## 4 Freelance <NA> No Bachelor's degree
## 5 Employed full time <NA> No Bachelor's degree
## 6 Employed full time No No Bachelor's degree
## country state income.range fav.reddit dog.cat
## 1 United States New York $150,000 or more getmotivated <NA>
## 2 United States New York $150,000 or more gaming <NA>
## 3 United States Virginia Under $20,000 snackexchange <NA>
## 4 United States New York $150,000 or more spacedicks <NA>
## 5 United States California $70,000 - $99,999 aww <NA>
## 6 United States New York $150,000 or more gaming I like dogs.
## cheese
## 1 <NA>
## 2 <NA>
## 3 <NA>
## 4 <NA>
## 5 <NA>
## 6 Cheddar
# Compact summary of the URL-imported data frame: its dimensions and each column's type.
str(reddit_df1)
## 'data.frame': 32754 obs. of 14 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ gender : int 0 0 1 0 1 0 0 0 0 0 ...
## $ age.range : chr "25-34" "25-34" "18-24" "25-34" ...
## $ marital.status : chr NA NA NA NA ...
## $ employment.status: chr "Employed full time" "Employed full time" "Freelance" "Freelance" ...
## $ military.service : chr NA NA NA NA ...
## $ children : chr "No" "No" "No" "No" ...
## $ education : chr "Bachelor's degree" "Bachelor's degree" "Some college" "Bachelor's degree" ...
## $ country : chr "United States" "United States" "United States" "United States" ...
## $ state : chr "New York" "New York" "Virginia" "New York" ...
## $ income.range : chr "$150,000 or more" "$150,000 or more" "Under $20,000" "$150,000 or more" ...
## $ fav.reddit : chr "getmotivated" "gaming" "snackexchange" "spacedicks" ...
## $ dog.cat : chr NA NA NA NA ...
## $ cheese : chr NA NA NA NA ...
# Import the FMR workbook from an .xlsx file already downloaded to the working
# directory, using readxl.
fmr_df <- read_excel(path = "FY2017_4050_FMR.xlsx")
# Preview the first six rows.
head(x = fmr_df, n = 6L)
## fips2010 fips2000 fmr2 fmr0 fmr1 fmr3 fmr4 State Metro_code
## 1 2300512300 <NA> 1078 755 851 1454 1579 23 METRO38860MM6400
## 2 6099999999 <NA> 677 502 506 987 1038 60 NCNTY60999N60999
## 3 6999999999 <NA> 666 411 498 961 1158 69 NCNTY69999N69999
## 4 0100199999 0100199999 822 587 682 1054 1425 1 METRO33860M33860
## 5 0100399999 0100399999 977 807 847 1422 1634 1 METRO19300M19300
## 6 0100599999 0100599999 671 501 505 839 958 1 NCNTY01005N01005
## areaname county CouSub countyname
## 1 Portland, ME HUD Metro FMR Area NA 12300 Cumberland County
## 2 American Samoa 999 99999 American Samoa
## 3 Northern Mariana Islands 999 99999 Northern Mariana Islands
## 4 Montgomery, AL MSA 1 99999 Autauga County
## 5 Daphne-Fairhope-Foley, AL MSA 3 99999 Baldwin County
## 6 Barbour County, AL 5 99999 Barbour County
## county_town_name pop2010 acs_2016_2 state_alpha fmr_type metro
## 1 Chebeague Island town 341 1109 ME 40 1
## 2 American Samoa 55519 653 AS 40 0
## 3 Northern Mariana Islands 53883 642 MP 40 0
## 4 Autauga County 54571 788 AL 40 1
## 5 Baldwin County 182265 873 AL 40 1
## 6 Barbour County 27457 636 AL 40 0
## FMR_PCT_Change FMR_Dollar_Change
## 1 0.9720469 -31
## 2 1.0367534 24
## 3 1.0373832 24
## 4 1.0431472 34
## 5 1.1191294 104
## 6 1.0550314 35
# Compact summary of the readxl import: its dimensions and each column's type.
str(fmr_df)
## Classes 'tbl_df', 'tbl' and 'data.frame': 4769 obs. of 21 variables:
## $ fips2010 : chr "2300512300" "6099999999" "6999999999" "0100199999" ...
## $ fips2000 : chr NA NA NA "0100199999" ...
## $ fmr2 : num 1078 677 666 822 977 ...
## $ fmr0 : num 755 502 411 587 807 501 665 665 491 464 ...
## $ fmr1 : num 851 506 498 682 847 505 751 751 494 467 ...
## $ fmr3 : num 1454 987 961 1054 1422 ...
## $ fmr4 : num 1579 1038 1158 1425 1634 ...
## $ State : num 23 60 69 1 1 1 1 1 1 1 ...
## $ Metro_code : chr "METRO38860MM6400" "NCNTY60999N60999" "NCNTY69999N69999" "METRO33860M33860" ...
## $ areaname : chr "Portland, ME HUD Metro FMR Area" "American Samoa" "Northern Mariana Islands" "Montgomery, AL MSA" ...
## $ county : num NA 999 999 1 3 5 7 9 11 13 ...
## $ CouSub : chr "12300" "99999" "99999" "99999" ...
## $ countyname : chr "Cumberland County" "American Samoa" "Northern Mariana Islands" "Autauga County" ...
## $ county_town_name : chr "Chebeague Island town" "American Samoa" "Northern Mariana Islands" "Autauga County" ...
## $ pop2010 : num 341 55519 53883 54571 182265 ...
## $ acs_2016_2 : num 1109 653 642 788 873 ...
## $ state_alpha : chr "ME" "AS" "MP" "AL" ...
## $ fmr_type : num 40 40 40 40 40 40 40 40 40 40 ...
## $ metro : num 1 0 0 1 1 0 1 1 0 0 ...
## $ FMR_PCT_Change : num 0.972 1.037 1.037 1.043 1.119 ...
## $ FMR_Dollar_Change: num -31 24 24 34 104 35 26 26 52 52 ...
# Import the same FMR workbook straight from its web address using gdata.
# NOTE(review): judging by the printed output, this import yields numeric fips
# codes (leading zeros dropped) and factor text columns, unlike the read_excel
# import of the local copy -- confirm whether that difference is acceptable.
url2 <- "http://www.huduser.gov/portal/datasets/fmr/fmr2017/FY2017_4050_FMR.xlsx"
fmr_df2 <- read.xls(xls = url2)
# Preview the first six rows.
head(x = fmr_df2, n = 6L)
## fips2010 fips2000 fmr2 fmr0 fmr1 fmr3 fmr4 State Metro_code
## 1 2300512300 NA 1078 755 851 1454 1579 23 METRO38860MM6400
## 2 6099999999 NA 677 502 506 987 1038 60 NCNTY60999N60999
## 3 6999999999 NA 666 411 498 961 1158 69 NCNTY69999N69999
## 4 100199999 100199999 822 587 682 1054 1425 1 METRO33860M33860
## 5 100399999 100399999 977 807 847 1422 1634 1 METRO19300M19300
## 6 100599999 100599999 671 501 505 839 958 1 NCNTY01005N01005
## areaname county CouSub countyname
## 1 Portland, ME HUD Metro FMR Area NA 12300 Cumberland County
## 2 American Samoa 999 99999 American Samoa
## 3 Northern Mariana Islands 999 99999 Northern Mariana Islands
## 4 Montgomery, AL MSA 1 99999 Autauga County
## 5 Daphne-Fairhope-Foley, AL MSA 3 99999 Baldwin County
## 6 Barbour County, AL 5 99999 Barbour County
## county_town_name pop2010 acs_2016_2 state_alpha fmr_type metro
## 1 Chebeague Island town 341 1109 ME 40 1
## 2 American Samoa 55519 653 AS 40 0
## 3 Northern Mariana Islands 53883 642 MP 40 0
## 4 Autauga County 54571 788 AL 40 1
## 5 Baldwin County 182265 873 AL 40 1
## 6 Barbour County 27457 636 AL 40 0
## FMR_PCT_Change FMR_Dollar_Change
## 1 0.9720469 -31
## 2 1.0367534 24
## 3 1.0373832 24
## 4 1.0431472 34
## 5 1.1191294 104
## 6 1.0550314 35
# Compact summary of the gdata import: its dimensions and each column's type.
# NOTE(review): the fips columns print as num and the text columns as Factor
# here -- confirm these types are what downstream code expects.
str(fmr_df2)
## 'data.frame': 4769 obs. of 21 variables:
## $ fips2010 : num 2.3e+09 6.1e+09 7.0e+09 1.0e+08 1.0e+08 ...
## $ fips2000 : num NA NA NA 1e+08 1e+08 ...
## $ fmr2 : int 1078 677 666 822 977 671 866 866 621 621 ...
## $ fmr0 : int 755 502 411 587 807 501 665 665 491 464 ...
## $ fmr1 : int 851 506 498 682 847 505 751 751 494 467 ...
## $ fmr3 : int 1454 987 961 1054 1422 839 1163 1163 853 849 ...
## $ fmr4 : int 1579 1038 1158 1425 1634 958 1298 1298 856 1094 ...
## $ State : int 23 60 69 1 1 1 1 1 1 1 ...
## $ Metro_code : Factor w/ 2598 levels "METRO10180M10180",..: 451 2592 2594 384 160 625 55 55 626 627 ...
## $ areaname : Factor w/ 2598 levels " Santa Ana-Anaheim-Irvine, CA HUD Metro FMR Area",..: 1903 52 1723 1633 571 122 186 186 263 271 ...
## $ county : int NA 999 999 1 3 5 7 9 11 13 ...
## $ CouSub : int 12300 99999 99999 99999 99999 99999 99999 99999 99999 99999 ...
## $ countyname : Factor w/ 1961 levels "A\xf1asco Municipio",..: 462 42 1265 92 99 110 163 178 239 249 ...
## $ county_town_name : Factor w/ 3175 levels "A\xf1asco Municipio",..: 533 61 2024 136 149 165 254 277 386 401 ...
## $ pop2010 : int 341 55519 53883 54571 182265 27457 22915 57322 10914 20947 ...
## $ acs_2016_2 : int 1109 653 642 788 873 636 840 840 569 569 ...
## $ state_alpha : Factor w/ 56 levels "AK","AL","AR",..: 24 4 28 2 2 2 2 2 2 2 ...
## $ fmr_type : int 40 40 40 40 40 40 40 40 40 40 ...
## $ metro : int 1 0 0 1 1 0 1 1 0 0 ...
## $ FMR_PCT_Change : num 0.972 1.037 1.037 1.043 1.119 ...
## $ FMR_Dollar_Change: int -31 24 24 34 104 35 26 26 52 52 ...
# Locate the .txt data file by scraping the city-list page for its links.
url3 <- "http://academic.udayton.edu/kissock/http/Weather/citylistUS.htm"
# Base address that the relative data-file paths are appended to.
url3mod <- "http://academic.udayton.edu/kissock/http/Weather/"
# Collect every hyperlink found on the city-list page.
links <- getHTMLLinks(doc = url3)
# Keep only the links containing the keyword "CIN".
links_data <- links[str_detect(string = links, pattern = "CIN")]
# Keep each link from "gsod" onward and prepend the base address to form a
# full URL.
filenames <- paste0(
  url3mod,
  str_sub(
    string = links_data,
    start = regexpr(pattern = "gsod", text = links_data)
  )
)
filenames
## [1] "http://academic.udayton.edu/kissock/http/Weather/gsod95-current/OHCINCIN.txt"
# Read the whitespace-delimited text file at the URL built above; no header is
# supplied, so columns receive the default names V1, V2, ...
CIN <- read.table(file = filenames)
# Preview the first six rows.
head(x = CIN, n = 6L)
## V1 V2 V3 V4
## 1 1 1 1995 41.1
## 2 1 2 1995 22.2
## 3 1 3 1995 22.8
## 4 1 4 1995 14.9
## 5 1 5 1995 9.5
## 6 1 6 1995 23.8
# Compact summary of the imported text file: its dimensions and each column's type.
str(CIN)
## 'data.frame': 7963 obs. of 4 variables:
## $ V1: int 1 1 1 1 1 1 1 1 1 1 ...
## $ V2: int 1 2 3 4 5 6 7 8 9 10 ...
## $ V3: int 1995 1995 1995 1995 1995 1995 1995 1995 1995 1995 ...
## $ V4: num 41.1 22.2 22.8 14.9 9.5 23.8 31.1 26.9 31.3 31.5 ...