# NOTE: `library(reticulate)` is R syntax, not Python; presumably it is left
# over from the R/knitr session that rendered this transcript. Executed as
# Python it would raise NameError (neither name is defined), so it is kept
# here only as a comment.
# library(reticulate)

# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this import needs an older scikit-learn -- confirm the pinned version.
from sklearn.datasets import load_boston

# Load the Boston house-prices dataset and print its bundled description text.
boston_dataset = load_boston()
description = boston_dataset.DESCR
print(description)
# print(description[148:1225])
## .. _boston_dataset:
##
## Boston house prices dataset
## ---------------------------
##
## **Data Set Characteristics:**
##
## :Number of Instances: 506
##
## :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.
##
## :Attribute Information (in order):
## - CRIM per capita crime rate by town
## - ZN proportion of residential land zoned for lots over 25,000 sq.ft.
## - INDUS proportion of non-retail business acres per town
## - CHAS Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
## - NOX nitric oxides concentration (parts per 10 million)
## - RM average number of rooms per dwelling
## - AGE proportion of owner-occupied units built prior to 1940
## - DIS weighted distances to five Boston employment centres
## - RAD index of accessibility to radial highways
## - TAX full-value property-tax rate per $10,000
## - PTRATIO pupil-teacher ratio by town
## - B 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
## - LSTAT % lower status of the population
## - MEDV Median value of owner-occupied homes in $1000's
##
## :Missing Attribute Values: None
##
## :Creator: Harrison, D. and Rubinfeld, D.L.
##
## This is a copy of UCI ML housing dataset.
## https://archive.ics.uci.edu/ml/machine-learning-databases/housing/
##
##
## This dataset was taken from the StatLib library which is maintained at Carnegie Mellon University.
##
## The Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic
## prices and the demand for clean air', J. Environ. Economics & Management,
## vol.5, 81-102, 1978. Used in Belsley, Kuh & Welsch, 'Regression diagnostics
## ...', Wiley, 1980. N.B. Various transformations are used in the table on
## pages 244-261 of the latter.
##
## The Boston house-price data has been used in many machine learning papers that address regression
## problems.
##
## .. topic:: References
##
## - Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.
## - Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.
import pandas as pd
# Wrap the raw feature matrix in a DataFrame, labelling each column with the
# dataset's own feature names.
df = pd.DataFrame(
    data=boston_dataset.data,
    columns=boston_dataset.feature_names,
)
# Peek at the first rows (the value is only echoed in an interactive session).
df.head()
## CRIM ZN INDUS CHAS NOX ... RAD TAX PTRATIO B LSTAT
## 0 0.00632 18.0 2.31 0.0 0.538 ... 1.0 296.0 15.3 396.90 4.98
## 1 0.02731 0.0 7.07 0.0 0.469 ... 2.0 242.0 17.8 396.90 9.14
## 2 0.02729 0.0 7.07 0.0 0.469 ... 2.0 242.0 17.8 392.83 4.03
## 3 0.03237 0.0 2.18 0.0 0.458 ... 3.0 222.0 18.7 394.63 2.94
## 4 0.06905 0.0 2.18 0.0 0.458 ... 3.0 222.0 18.7 396.90 5.33
##
## [5 rows x 13 columns]
# Append the regression target as a new "MEDV" column, selecting target values
# by the frame's row index.
df["MEDV"] = boston_dataset.target[df.index]
# Re-inspect the first rows, now including the target column.
df.head()
## CRIM ZN INDUS CHAS NOX ... TAX PTRATIO B LSTAT MEDV
## 0 0.00632 18.0 2.31 0.0 0.538 ... 296.0 15.3 396.90 4.98 24.0
## 1 0.02731 0.0 7.07 0.0 0.469 ... 242.0 17.8 396.90 9.14 21.6
## 2 0.02729 0.0 7.07 0.0 0.469 ... 242.0 17.8 392.83 4.03 34.7
## 3 0.03237 0.0 2.18 0.0 0.458 ... 222.0 18.7 394.63 2.94 33.4
## 4 0.06905 0.0 2.18 0.0 0.458 ... 222.0 18.7 396.90 5.33 36.2
##
## [5 rows x 14 columns]
# Dimensions of the table as a (rows, columns) tuple.
df.shape
## (506, 14)
# Per-column summary statistics: count, mean, std, min, quartiles and max.
df.describe()
## CRIM ZN INDUS ... B LSTAT MEDV
## count 506.000000 506.000000 506.000000 ... 506.000000 506.000000 506.000000
## mean 3.613524 11.363636 11.136779 ... 356.674032 12.653063 22.532806
## std 8.601545 23.322453 6.860353 ... 91.294864 7.141062 9.197104
## min 0.006320 0.000000 0.460000 ... 0.320000 1.730000 5.000000
## 25% 0.082045 0.000000 5.190000 ... 375.377500 6.950000 17.025000
## 50% 0.256510 0.000000 9.690000 ... 391.440000 11.360000 21.200000
## 75% 3.677083 12.500000 18.100000 ... 396.225000 16.955000 25.000000
## max 88.976200 100.000000 27.740000 ... 396.900000 37.970000 50.000000
##
## [8 rows x 14 columns]
import seaborn as sns
import matplotlib.pyplot as plt
# Apply the seaborn "ticks" theme and enable the short colour codes.
sns.set(style="ticks", color_codes=True)
#%matplotlib inline

# Open a 12x10 figure, then draw the pairwise column correlations on it as an
# annotated heatmap.
plt.figure(figsize=(12, 10))
sns.heatmap(data=df.corr(), annot=True)
