library(reticulate)
from sklearn.datasets import load_boston
# load the bundled Boston housing data and print its built-in description
boston_dataset = load_boston()
description = boston_dataset.DESCR
print(description)
# print(description[148:1225])
## .. _boston_dataset:
## 
## Boston house prices dataset
## ---------------------------
## 
## **Data Set Characteristics:**  
## 
##     :Number of Instances: 506 
## 
##     :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.
## 
##     :Attribute Information (in order):
##         - CRIM     per capita crime rate by town
##         - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
##         - INDUS    proportion of non-retail business acres per town
##         - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
##         - NOX      nitric oxides concentration (parts per 10 million)
##         - RM       average number of rooms per dwelling
##         - AGE      proportion of owner-occupied units built prior to 1940
##         - DIS      weighted distances to five Boston employment centres
##         - RAD      index of accessibility to radial highways
##         - TAX      full-value property-tax rate per $10,000
##         - PTRATIO  pupil-teacher ratio by town
##         - B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
##         - LSTAT    % lower status of the population
##         - MEDV     Median value of owner-occupied homes in $1000's
## 
##     :Missing Attribute Values: None
## 
##     :Creator: Harrison, D. and Rubinfeld, D.L.
## 
## This is a copy of UCI ML housing dataset.
## https://archive.ics.uci.edu/ml/machine-learning-databases/housing/
## 
## 
## This dataset was taken from the StatLib library which is maintained at Carnegie Mellon University.
## 
## The Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic
## prices and the demand for clean air', J. Environ. Economics & Management,
## vol.5, 81-102, 1978.   Used in Belsley, Kuh & Welsch, 'Regression diagnostics
## ...', Wiley, 1980.   N.B. Various transformations are used in the table on
## pages 244-261 of the latter.
## 
## The Boston house-price data has been used in many machine learning papers that address regression
## problems.   
##      
## .. topic:: References
## 
##    - Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.
##    - Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.
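Note that `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2, so the chunk above only runs on older versions. On a newer release the same numbers can still be pulled from the StatLib source mentioned in the description; the sketch below follows the approach suggested in scikit-learn's deprecation notice (the slicing reflects the file's two-lines-per-record layout, and the column order matches the attribute list above).

import numpy as np
import pandas as pd

# each record in the StatLib file is split across two lines: 11 values, then 3
data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
data = np.hstack([raw.values[::2, :], raw.values[1::2, :2]])  # 13 predictors
target = raw.values[1::2, 2]                                  # MEDV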
import pandas as pd
# wrap the feature matrix in a DataFrame, labelled with the dataset's feature names
df = pd.DataFrame(boston_dataset.data, columns=boston_dataset.feature_names)
df.head()
##       CRIM    ZN  INDUS  CHAS    NOX  ...  RAD    TAX  PTRATIO       B  LSTAT
## 0  0.00632  18.0   2.31   0.0  0.538  ...  1.0  296.0     15.3  396.90   4.98
## 1  0.02731   0.0   7.07   0.0  0.469  ...  2.0  242.0     17.8  396.90   9.14
## 2  0.02729   0.0   7.07   0.0  0.469  ...  2.0  242.0     17.8  392.83   4.03
## 3  0.03237   0.0   2.18   0.0  0.458  ...  3.0  222.0     18.7  394.63   2.94
## 4  0.06905   0.0   2.18   0.0  0.458  ...  3.0  222.0     18.7  396.90   5.33
## 
## [5 rows x 13 columns]
# append the target (median home value in $1000's) as the MEDV column
df['MEDV'] = boston_dataset.target
df.head()
##       CRIM    ZN  INDUS  CHAS    NOX  ...    TAX  PTRATIO       B  LSTAT  MEDV
## 0  0.00632  18.0   2.31   0.0  0.538  ...  296.0     15.3  396.90   4.98  24.0
## 1  0.02731   0.0   7.07   0.0  0.469  ...  242.0     17.8  396.90   9.14  21.6
## 2  0.02729   0.0   7.07   0.0  0.469  ...  242.0     17.8  392.83   4.03  34.7
## 3  0.03237   0.0   2.18   0.0  0.458  ...  222.0     18.7  394.63   2.94  33.4
## 4  0.06905   0.0   2.18   0.0  0.458  ...  222.0     18.7  396.90   5.33  36.2
## 
## [5 rows x 14 columns]
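The description claims ":Missing Attribute Values: None"; a quick sanity check with pandas confirms it before moving on.

# total count of missing values across all 14 columns
df.isnull().sum().sum()  # expected: 0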
df.shape
## (506, 14)
df.describe()
##              CRIM          ZN       INDUS  ...           B       LSTAT        MEDV
## count  506.000000  506.000000  506.000000  ...  506.000000  506.000000  506.000000
## mean     3.613524   11.363636   11.136779  ...  356.674032   12.653063   22.532806
## std      8.601545   23.322453    6.860353  ...   91.294864    7.141062    9.197104
## min      0.006320    0.000000    0.460000  ...    0.320000    1.730000    5.000000
## 25%      0.082045    0.000000    5.190000  ...  375.377500    6.950000   17.025000
## 50%      0.256510    0.000000    9.690000  ...  391.440000   11.360000   21.200000
## 75%      3.677083   12.500000   18.100000  ...  396.225000   16.955000   25.000000
## max     88.976200  100.000000   27.740000  ...  396.900000   37.970000   50.000000
## 
## [8 rows x 14 columns]
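The summary also shows very different scales across columns (CRIM runs from about 0.006 to 89, TAX is in the hundreds, NOX stays below 1). The correlation heatmap below is unaffected by this, but if the predictors later feed a scale-sensitive model, a standardization sketch using scikit-learn's StandardScaler might look like the following (this step is an assumption, not part of the original pipeline).

from sklearn.preprocessing import StandardScaler
# rescale the 13 predictors to zero mean and unit variance; MEDV is left untouched
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(df.drop(columns='MEDV')),
                        columns=boston_dataset.feature_names)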
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="ticks", color_codes=True)
#%matplotlib inline
plt.figure(figsize=(12, 10))
# annotated correlation matrix across all 14 columns
sns.heatmap(df.corr(), annot=True)
plt.show()
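Reading a 14 x 14 grid of annotations off the heatmap can be tedious, so a quick complement is to pull out just the correlations with the target and sort them; a small sketch (on this data, RM and LSTAT typically show the strongest positive and negative associations with MEDV).

# rank each predictor by its linear correlation with the target
corr_with_target = df.corr()['MEDV'].drop('MEDV').sort_values(ascending=False)
print(corr_with_target)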