Initial Analysis

S. Jackson Kelley

A01281942

Reading in the data

Here we read in data exported from the mongoDB database running on the AWS server. Outcomes contains only the itemID, categoryID/Name, and final selling price. forSale contains any features I thought might be of use to a machine learning algorithm. This includes things like the item’s title string as well as location.

outcomes = read.csv("completedItems_2.csv",header=TRUE)
forSale = read.csv("forSale_2.csv",header=TRUE)
head(outcomes)
##           X_id categoryId        categoryName sellingPrice
## 1 112185507879     139971 Video Game Consoles       349.99
## 2 122202585486     139971 Video Game Consoles       305.00
## 3 291926308477     139971 Video Game Consoles       200.98
## 4 262695233466     139971 Video Game Consoles       280.00
## 5 142163675662     139971 Video Game Consoles       220.00
## 6 262695805629     139971 Video Game Consoles       250.00
head(forSale)
##           X_id
## 1 172388678529
## 2 122194533957
## 3 282229141811
## 4 282233993583
## 5 162252708291
## 6 131981426207
##                                                                           title
## 1                  NEW PLAYSTATION 4 LIMITED EDITION TACO BELL GOLD CONSOLE PS4
## 2 SONY PLAYSTATION 4 (CUH-1215A) 500GB DESTINY THE TAKEN KING LIMITED EDITION  
## 3                                         SONY PLAYSTATION 4 SYSTEM  500 GIG HD
## 4                                   SONY PLAYSTATION 4 500 GB JET BLACK CONSOLE
## 5                    UNCHARTED 4: A THIEF'S END (SONY PLAYSTATION 4 SLIM, 2016)
## 6                                                            SONY PLAYSTATION 4
##   currentPrice shippingCost calculateShipping totalPrice
## 1       650.00        25.00             false     675.00
## 2       250.00        10.00             false     260.00
## 3       239.99        19.99             false     259.98
## 4       200.00        30.00             false     230.00
## 5       237.50        28.00             false     265.50
## 6       132.50        14.00             false     146.50
##                dateQueried                  endDate               location
## 1 2016-10-29T07:30:02.399Z 2016-10-30T02:35:15.000Z       Las Vegas,NV,USA
## 2 2016-10-29T07:30:02.423Z 2016-10-29T15:18:15.000Z      Montgomery,AL,USA
## 3 2016-10-29T07:30:03.615Z 2016-10-29T17:05:05.000Z         Ballwin,MO,USA
## 4 2016-10-29T07:30:03.620Z 2016-10-29T18:57:11.000Z        Trinidad,CO,USA
## 5 2016-10-29T07:30:03.623Z 2016-10-29T16:49:40.000Z West Palm Beach,FL,USA
## 6 2016-10-29T07:30:03.629Z 2016-10-29T21:54:15.000Z       Watertown,CT,USA
##   country bidCount listingType bestOffer buyItNowAvailable conditionId
## 1      US      NaN  FixedPrice     false             false        1000
## 2      US        1     Auction     false             false        3000
## 3      US        0     Auction     false             false        3000
## 4      US        0     Auction     false             false        3000
## 5      US       41     Auction     false             false        1500
## 6      US       20     Auction     false             false        3000
##      conditionDisplayName
## 1                     New
## 2                    Used
## 3                    Used
## 4                    Used
## 5 New other (see details)
## 6                    Used
##                                                                                timeLeft
## 1  {"milliseconds":0,"seconds":32,"minutes":3,"hours":19,"days":0,"months":0,"years":0}
## 2  {"milliseconds":0,"seconds":32,"minutes":46,"hours":7,"days":0,"months":0,"years":0}
## 3  {"milliseconds":0,"seconds":20,"minutes":33,"hours":9,"days":0,"months":0,"years":0}
## 4 {"milliseconds":0,"seconds":26,"minutes":25,"hours":11,"days":0,"months":0,"years":0}
## 5  {"milliseconds":0,"seconds":55,"minutes":17,"hours":9,"days":0,"months":0,"years":0}
## 6 {"milliseconds":0,"seconds":30,"minutes":22,"hours":14,"days":0,"months":0,"years":0}

Above is shown an overview of every feature collected from the eBay API in the forSale dataframe. A few confusing one’s will be noted for the purposes of this presentation.

Pre-processing

I will perform the imputation of shipping costs now. First we change the -1 values to NA’s, and then impute NA’s with the average shipping cost for non NA items.

forSale$shippingCost[forSale$calculateShipping == "true"] = NA
forSale$shippingCost = with (forSale,impute(shippingCost,mean))
forSale$bidCount     = with(forSale, impute(bidCount,mean))
forSale$bidCount = as.numeric(forSale$bidCount)

Calculating imputed totalPrice and removing sold items that aren’t video game consoles.

forSale$totalPrice = forSale$currentPrice + forSale$shippingCost
forSale$totalPrice = as.numeric(forSale$totalPrice)
outcomes = outcomes[outcomes$categoryName == "Video Game Consoles",]

Dates

Here I’m converting date strings to actual R dates.

dates = as.Date(forSale$endDate)
forSale$dayOfWeek = as.factor(weekdays(dates))

Outliers

We can use R’s built in boxplot code to detect and remove outliers from the dataset.

outliers = boxplot.stats(forSale$totalPrice)$out
forSale = forSale[!forSale$totalPrice %in% outliers,]
forSale = forSale[(forSale$totalPrice > 100),]

Summary

Here we summarize the data to see if anything stands out as interesting in the data for further analysis or visualization.

forSale Summary

It looks like surprisingly there are a higher number of Playstation 4’s with the title “SONY PLAYSTATION 4 BASIC SET 500 GB BLACK CONSOLE” and “SONY PLAYSTATION 4 (LATEST MODEL)- 500 GB BLACK CONSOLE” in the title. This may be due to the fact that many people are buying and relisting with the same title, similar to my software. We also print out the SD for all playstation 4’s in order to get a better idea of how variable the prices are.

summary(forSale)
## 
##  739 values imputed to 10.05239
##       X_id          
##  Min.   :1.120e+11  
##  1st Qu.:1.623e+11  
##  Median :2.223e+11  
##  Mean   :2.271e+11  
##  3rd Qu.:2.822e+11  
##  Max.   :4.012e+11  
##                     
##                                                                               title     
##  SONY PLAYSTATION 4 BASIC SET 500 GB BLACK CONSOLE                               : 212  
##  SONY PLAYSTATION 4 (LATEST MODEL)- 500 GB BLACK CONSOLE                         :  67  
##  SONY PLAYSTATION 4 500 GB BLACK CONSOLE                                         :  58  
##  SONY PLAYSTATION 4 CALL OF DUTY: BLACK OPS III - STANDARD EDITION 500 GB JET... :  42  
##  PLAYSTATION 4                                                                   :  24  
##  SONY PLAYSTATION 4 CALL OF DUTY: BLACK OPS III - STANDARD EDITION 500 GB JET BL…:  20  
##  (Other)                                                                         :1216  
##   currentPrice    shippingCost   calculateShipping   totalPrice   
##  Min.   : 80.0   Min.   : 0.00   false:900         Min.   :102.0  
##  1st Qu.:177.5   1st Qu.: 0.00   true :739         1st Qu.:187.5  
##  Median :202.5   Median :10.05                     Median :215.5  
##  Mean   :225.8   Mean   :10.21                     Mean   :236.0  
##  3rd Qu.:255.0   3rd Qu.:12.95                     3rd Qu.:269.0  
##  Max.   :469.9   Max.   :50.00                     Max.   :469.9  
##                                                                   
##                    dateQueried                      endDate    
##                          :739   2016-11-02T17:06:29.000Z:   2  
##  2016-10-29T07:30:02.423Z:  1   2016-11-04T01:00:08.000Z:   2  
##  2016-10-29T07:30:03.615Z:  1   2016-11-12T03:00:07.000Z:   2  
##  2016-10-29T07:30:03.620Z:  1   2016-11-14T16:27:11.000Z:   2  
##  2016-10-29T07:30:03.623Z:  1   2016-11-15T15:30:23.000Z:   2  
##  2016-10-29T07:30:03.629Z:  1   2016-11-22T00:59:57.000Z:   2  
##  (Other)                 :895   (Other)                 :1627  
##                location       country        bidCount    
##  USA               :  38   US     :1585   Min.   : 0.00  
##  Canada            :  28   CA     :  28   1st Qu.: 0.00  
##  Columbus,OH,USA   :  23   GB     :   5   Median : 8.00  
##  Miami,FL,USA      :  21   KR     :   5   Mean   :11.36  
##  Los Angeles,CA,USA:  20   JP     :   4   3rd Qu.:12.00  
##  New York,NY,USA   :  18   RU     :   3   Max.   :82.00  
##  (Other)           :1491   (Other):   9                  
##          listingType   bestOffer    buyItNowAvailable  conditionId  
##  Auction       :1159   false:1512   false:1480        Min.   :1000  
##  AuctionWithBIN: 159   true : 127   true : 159        1st Qu.:3000  
##  FixedPrice    : 204                                  Median :3000  
##  StoreInventory: 117                                  Mean   :2629  
##                                                       3rd Qu.:3000  
##                                                       Max.   :3000  
##                                                                     
##                conditionDisplayName
##  Used                    :1280     
##  New                     : 219     
##  New other (see details) :  93     
##  Seller refurbished      :  34     
##  Manufacturer refurbished:  13     
##  Brand New               :   0     
##  (Other)                 :   0     
##                                                                                   timeLeft   
##  {"milliseconds":0,"seconds":14,"minutes":28,"hours":18,"days":0,"months":0,"years":0}:   3  
##  {"milliseconds":0,"seconds":0,"minutes":43,"hours":18,"days":0,"months":0,"years":0} :   2  
##  {"milliseconds":0,"seconds":10,"minutes":4,"hours":16,"days":0,"months":0,"years":0} :   2  
##  {"milliseconds":0,"seconds":13,"minutes":28,"hours":18,"days":0,"months":0,"years":0}:   2  
##  {"milliseconds":0,"seconds":16,"minutes":33,"hours":20,"days":0,"months":0,"years":0}:   2  
##  {"milliseconds":0,"seconds":22,"minutes":28,"hours":10,"days":0,"months":0,"years":0}:   2  
##  (Other)                                                                              :1626  
##      dayOfWeek  
##  Friday   :225  
##  Monday   :300  
##  Saturday :223  
##  Sunday   :266  
##  Thursday :203  
##  Tuesday  :248  
##  Wednesday:174
sd(forSale$totalPrice)
## [1] 68.25486

Initial Questions

  • Does a higher bidCount the day before the end date result in a higher price?

  • Does buyItNow effect price?

  • Does the text in the item title effect price? (appears to be significant after totalCost)

  • Does day of the week effect price?

Outcomes Summary

Really there isn’t much to this data as it’s our outcome we’re trying to predict. We do see that the average price of all Playstation 4’s is 267USD.

It appears that the SD’s of the two datasets are similar, which gives me some comfort in the data gathering methods.

summary(outcomes)
##       X_id             categoryId                            categoryName 
##  Min.   :1.121e+11   Min.   :139971   Video Game Consoles          :7797  
##  1st Qu.:1.623e+11   1st Qu.:139971   Cell Phones & Smartphones    :   0  
##  Median :2.321e+11   Median :139971   Consoles de jeux vidéo       :   0  
##  Mean   :2.297e+11   Mean   :139971   Console Systems              :   0  
##  3rd Qu.:2.822e+11   3rd Qu.:139971   DJ Controllers               :   0  
##  Max.   :4.012e+11   Max.   :139971   Faceplates, Decals & Stickers:   0  
##                                       (Other)                      :   0  
##   sellingPrice  
##  Min.   :120.0  
##  1st Qu.:224.9  
##  Median :247.5  
##  Mean   :265.0  
##  3rd Qu.:285.0  
##  Max.   :592.0  
## 
sd(outcomes$sellingPrice)
## [1] 70.30095

Visualization

I’d first like to see the histogram of these items and see just how variable their prices are.

Histogram of Price Counts for Sold Playstation4s

library(ggplot2)
ggplot(outcomes, aes(sellingPrice, fill = categoryName)) +
  geom_histogram(binwidth = 5)

###Histogram of Price Counts for Playstation4s currently for sale

ggplot(forSale, aes(totalPrice, fill = listingType)) +
  geom_histogram(binwidth = 5)

These data both appear to be positively skewed.

Plotting the location of sold Playstation4s

The map plot shows a high number of Playstation 4’s are sold in the North East of the US and around the Los Angeles area. This is probably simple due to the higher populations in these areas.

library(ggmap)
map1 = ggmap(get_map(location = "United States",zoom=4))+geom_point(data=locations, aes(x=lon,y=lat),color="orange")
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=United+States&zoom=4&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=United%20States&sensor=false
map1
## Warning: Removed 16 rows containing missing values (geom_point).