!pip install kaggle


import webbrowser
import ipywidgets as widgets
from IPython.display import display

def open_kaggle_website(url):
    kaggle_url = "https://www.kaggle.com/"
    full_url = kaggle_url + url.strip("/")
    webbrowser.open_new_tab(full_url)

# Create an input widget for entering the Kaggle URL
url_input = widgets.Text(
    placeholder="",
    description="Kaggle URL:",
    layout=widgets.Layout(width="500px")
)

# Create a button widget for triggering the website opening
open_button = widgets.Button(description="Open Kaggle Website")

def on_button_clicked(b):
    open_kaggle_website(url_input.value)

open_button.on_click(on_button_clicked)

# Display the widgets
display(url_input, open_button)


from IPython.display import IFrame
kaggle = IFrame(src = "https://www.kaggle.com", width =1000 , height = 500)
display(kaggle)


import json
from pathlib import Path
import requests

# Load your Kaggle API credentials from the JSON file
api_credentials_path = Path("path_to_your_api_credentials.json")
with open(api_credentials_path, 'r') as f:
    api_credentials = json.load(f)

# Set your Kaggle API credentials
username = api_credentials["username"]
api_key = api_credentials["key"]

# Specify the dataset you want to download (replace with the dataset slug)
dataset_slug = "your_dataset_slug"

# Specify the directory where you want to save the downloaded dataset
download_dir = Path("path_to_download_directory")

# Construct the URL to download the dataset
download_url = f"https://www.kaggle.com/{username}/{dataset_slug}/download"

# Set the Kaggle API credentials
kaggle_auth = (username, api_key)

# Download the dataset using requests
response = requests.get(download_url, auth=kaggle_auth)

# Save the downloaded dataset to the specified directory
downloaded_file_path = download_dir / f"{dataset_slug}.zip"
with open(downloaded_file_path, 'wb') as f:
    f.write(response.content)

print("Dataset downloaded and saved:", downloaded_file_path)


import numpy as np  # Library for numerical computations
import pandas as pd  # Library for data manipulation and analysis
import seaborn as sns  # Library for statistical data visualization
import matplotlib.pyplot as plt  # Library for creating plots and visualizations


# Import the necessary Plotly libraries
import plotly.graph_objects as go  # Low-level interface for creating Plotly plots
import plotly.express as px  # Higher-level interface for creating interactive plots


# Import the necessary ipywidgets libraries
import ipywidgets as widgets  # Library for creating interactive widgets
from ipywidgets import interact, interact_manual  # Functions for creating interactive controls


# to delete warnings
import warnings
warnings.filterwarnings("ignore")


from sklearn.model_selection import train_test_split, cross_val_score  # Functions for splitting data into train and test sets, and performing cross-validation
from sklearn.preprocessing import StandardScaler  # Class for standardizing features by removing the mean and scaling to unit variance
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error  # Evaluation metrics for regression models


from sklearn.linear_model import LinearRegression  # Linear regression model
from sklearn.linear_model import Ridge  # Ridge regression model
from sklearn.linear_model import Lasso  # Lasso regression model
from sklearn.linear_model import ElasticNet  # ElasticNet regression model
from sklearn.ensemble import RandomForestRegressor  # Random Forest regression model
from sklearn.svm import SVR  # Support Vector Regression model
from xgboost import XGBRegressor  # XGBoost regression model
from sklearn.preprocessing import PolynomialFeatures  # Class for generating polynomial features


# Import the train and test datasets
df_train = pd.read_csv(r'C:\Users\User\Desktop\GitHub-projects\projects\Data-Dives-Projects-Unleashed\dataSets\course 1\week 1\datasets\House_Price_Prediction\train _price_house.csv')
df_test = pd.read_csv(r'C:\Users\User\Desktop\GitHub-projects\projects\Data-Dives-Projects-Unleashed\dataSets\course 1\week 1\datasets\House_Price_Prediction\test.csv')


df_train.shape

(1460, 81)


df_train.head(10)


dtypes_train = (df_train.dtypes)
(df_train.dtypes).to_csv("dtypes_train.csv")

dtypes_test = (df_test.dtypes)
(df_train.dtypes).to_csv("dtypes_train.csv")

dtypes_train
dtypes_test

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
                  ...   
MiscVal            int64
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
Length: 80, dtype: object


# Check for null values in the train dataset
null_values_train = df_train.isnull()

# Print the number of rows that have NaN values
num_rows_with_nan_train = len(df_train) - len(df_train.dropna())
num_rows_with_nan_train_with_thresh =  len(df_train) - len(df_train.dropna(thresh=66))

print(f"The number of rows that have NaN values: {num_rows_with_nan_train}, with thresh = 65: {num_rows_with_nan_train_with_thresh }")

The number of rows that have NaN values: 1460, with thresh = 65: 0


# Check for null values in the test dataset
null_values_test = df_test.isnull()

# Print the number of rows that have NaN values
num_rows_with_nan_test = len(df_test) - len(df_test.dropna())
num_rows_with_nan_test_with_thresh =  len(df_test) - len(df_test.dropna(thresh=65))

print(f"The number of rows that have NaN values: {num_rows_with_nan_test}, with thresh = 65 : {num_rows_with_nan_test_with_thresh }")

The number of rows that have NaN values: 1459, with thresh = 65 : 1


# Display information about the DataFrame
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallCond    1460 non-null   int64  
 19  YearBuilt      1460 non-null   int64  
 20  YearRemodAdd   1460 non-null   int64  
 21  RoofStyle      1460 non-null   object 
 22  RoofMatl       1460 non-null   object 
 23  Exterior1st    1460 non-null   object 
 24  Exterior2nd    1460 non-null   object 
 25  MasVnrType     1452 non-null   object 
 26  MasVnrArea     1452 non-null   float64
 27  ExterQual      1460 non-null   object 
 28  ExterCond      1460 non-null   object 
 29  Foundation     1460 non-null   object 
 30  BsmtQual       1423 non-null   object 
 31  BsmtCond       1423 non-null   object 
 32  BsmtExposure   1422 non-null   object 
 33  BsmtFinType1   1423 non-null   object 
 34  BsmtFinSF1     1460 non-null   int64  
 35  BsmtFinType2   1422 non-null   object 
 36  BsmtFinSF2     1460 non-null   int64  
 37  BsmtUnfSF      1460 non-null   int64  
 38  TotalBsmtSF    1460 non-null   int64  
 39  Heating        1460 non-null   object 
 40  HeatingQC      1460 non-null   object 
 41  CentralAir     1460 non-null   object 
 42  Electrical     1459 non-null   object 
 43  1stFlrSF       1460 non-null   int64  
 44  2ndFlrSF       1460 non-null   int64  
 45  LowQualFinSF   1460 non-null   int64  
 46  GrLivArea      1460 non-null   int64  
 47  BsmtFullBath   1460 non-null   int64  
 48  BsmtHalfBath   1460 non-null   int64  
 49  FullBath       1460 non-null   int64  
 50  HalfBath       1460 non-null   int64  
 51  BedroomAbvGr   1460 non-null   int64  
 52  KitchenAbvGr   1460 non-null   int64  
 53  KitchenQual    1460 non-null   object 
 54  TotRmsAbvGrd   1460 non-null   int64  
 55  Functional     1460 non-null   object 
 56  Fireplaces     1460 non-null   int64  
 57  FireplaceQu    770 non-null    object 
 58  GarageType     1379 non-null   object 
 59  GarageYrBlt    1379 non-null   float64
 60  GarageFinish   1379 non-null   object 
 61  GarageCars     1460 non-null   int64  
 62  GarageArea     1460 non-null   int64  
 63  GarageQual     1379 non-null   object 
 64  GarageCond     1379 non-null   object 
 65  PavedDrive     1460 non-null   object 
 66  WoodDeckSF     1460 non-null   int64  
 67  OpenPorchSF    1460 non-null   int64  
 68  EnclosedPorch  1460 non-null   int64  
 69  3SsnPorch      1460 non-null   int64  
 70  ScreenPorch    1460 non-null   int64  
 71  PoolArea       1460 non-null   int64  
 72  PoolQC         7 non-null      object 
 73  Fence          281 non-null    object 
 74  MiscFeature    54 non-null     object 
 75  MiscVal        1460 non-null   int64  
 76  MoSold         1460 non-null   int64  
 77  YrSold         1460 non-null   int64  
 78  SaleType       1460 non-null   object 
 79  SaleCondition  1460 non-null   object 
 80  SalePrice      1460 non-null   int64  
dtypes: float64(3), int64(35), object(43)
memory usage: 924.0+ KB


# Get the columns with non-null values
non_null_columns = df_train.columns[df_train.notnull().any()].tolist()

# Print the columns with non-null values
print("Columns with non-null values:")
for column in non_null_columns:
    print(column)

Columns with non-null values:
Id
MSSubClass
MSZoning
LotFrontage
LotArea
Street
Alley
LotShape
LandContour
Utilities
LotConfig
LandSlope
Neighborhood
Condition1
Condition2
BldgType
HouseStyle
OverallQual
OverallCond
YearBuilt
YearRemodAdd
RoofStyle
RoofMatl
Exterior1st
Exterior2nd
MasVnrType
MasVnrArea
ExterQual
ExterCond
Foundation
BsmtQual
BsmtCond
BsmtExposure
BsmtFinType1
BsmtFinSF1
BsmtFinType2
BsmtFinSF2
BsmtUnfSF
TotalBsmtSF
Heating
HeatingQC
CentralAir
Electrical
1stFlrSF
2ndFlrSF
LowQualFinSF
GrLivArea
BsmtFullBath
BsmtHalfBath
FullBath
HalfBath
BedroomAbvGr
KitchenAbvGr
KitchenQual
TotRmsAbvGrd
Functional
Fireplaces
FireplaceQu
GarageType
GarageYrBlt
GarageFinish
GarageCars
GarageArea
GarageQual
GarageCond
PavedDrive
WoodDeckSF
OpenPorchSF
EnclosedPorch
3SsnPorch
ScreenPorch
PoolArea
PoolQC
Fence
MiscFeature
MiscVal
MoSold
YrSold
SaleType
SaleCondition
SalePrice


# Get the columns with null values in the train data
null_columns = df_train.columns[df_train.isnull().any()].tolist()

# Print the data frames with null values
print("Data frames with null values in train data:")
for column in null_columns:
    print(column,"------>", df_train[column].dtype)

Data frames with null values in train data:
LotFrontage ------> float64
Alley ------> object
MasVnrType ------> object
MasVnrArea ------> float64
BsmtQual ------> object
BsmtCond ------> object
BsmtExposure ------> object
BsmtFinType1 ------> object
BsmtFinType2 ------> object
Electrical ------> object
FireplaceQu ------> object
GarageType ------> object
GarageYrBlt ------> float64
GarageFinish ------> object
GarageQual ------> object
GarageCond ------> object
PoolQC ------> object
Fence ------> object
MiscFeature ------> object


# Get the columns with null values in the test data
null_columns = df_test.columns[df_test.isnull().any()].tolist()

# Print the data frames with null values
print("Data frames with null values in test data:")
for column in null_columns:
    print(column)

Data frames with null values in test data:
MSZoning
LotFrontage
Alley
Utilities
Exterior1st
Exterior2nd
MasVnrType
MasVnrArea
BsmtQual
BsmtCond
BsmtExposure
BsmtFinType1
BsmtFinSF1
BsmtFinType2
BsmtFinSF2
BsmtUnfSF
TotalBsmtSF
BsmtFullBath
BsmtHalfBath
KitchenQual
Functional
FireplaceQu
GarageType
GarageYrBlt
GarageFinish
GarageCars
GarageArea
GarageQual
GarageCond
PoolQC
Fence
MiscFeature
SaleType


# Calculate basic statistics for each column using the describe() method
statistics = df_train.describe()

# Transpose the statistics DataFrame to have columns as variable names
statistics_transposed = statistics.T

# Print the transposed statistics DataFrame
print("Descriptive statistics for each column in train data:")
statistics_transposed

Descriptive statistics for each column in train data:


# Calculate basic statistics for each column in test data using the describe() method
statistics = df_test.describe()

# Transpose the statistics DataFrame to have columns as variable names
statistics_transposed = statistics.T

# Print the transposed statistics DataFrame
print("Descriptive statistics for each column in test data:")
statistics_transposed

Descriptive statistics for each column in test data:


# Create a histogram of the target variable 'SalePrice'
plt.figure(figsize=(8, 6))
plt.hist(df_train['SalePrice'], bins=30, edgecolor='black')
plt.xlabel('Sale Price')
plt.ylabel('Frequency')
plt.title('Distribution of Sale Price')
plt.show()


# Create a boxplot of the target variable 'SalePrice'
plt.figure(figsize=(8, 6))
plt.boxplot(df_train['SalePrice'])
plt.xlabel('Sale Price')
plt.title('Distribution of Sale Price')
plt.show()


# Define the columns suitable for histogram visualization
histogram_columns = ['LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
                     '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
                     'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'SalePrice']

# Create an interactive function to plot the histogram
@interact
def plot_histogram(column=histogram_columns):
    # Create a Figure object
    fig = go.Figure()

    # Plot the histogram using Plotly
    fig.add_trace(go.Histogram(x=df_train[column], nbinsx=30, marker=dict(color='steelblue')))

    # Update layout
    fig.update_layout(
        title=f"Distribution of {column} in Train Data",
        xaxis_title=column,
        yaxis_title="Frequency",
        bargap=0.1,
        bargroupgap=0.2,
        template="plotly_white"
    )

    # Show the figure
    fig.show()

    # Plot the histogram using Matplotlib and Seaborn
    plt.figure(figsize=(8, 6))
    sns.histplot(data=df_train, x=column, kde=True, color='steelblue')
    plt.title(f"Distribution of {column} in Train Data")
    plt.xlabel(column)
    plt.ylabel("Frequency")
    plt.grid(True)
    plt.show()


# Define the columns suitable for histogram visualization
histogram_columns = ['LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
                     '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
                     'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'SalePrice']

# Create an interactive function to plot the histogram
@interact
def plot_histogram(column=histogram_columns):
    # Create a histogram using Plotly Express
    fig = px.histogram(df_train, x=column, nbins=30, marginal="rug", color_discrete_sequence=["steelblue"])

    # Update layout
    fig.update_layout(
        title=f"Distribution of {column}",
        xaxis_title=column,
        yaxis_title="Frequency",
        bargap=0.1,
        bargroupgap=0.2,
        template="plotly_white"
    )

    # Show the histogram
    fig.show()


# Create an interactive function to plot the histograms
@interact(column=histogram_columns)
def plot_histogram(column):
    # Check if the column is in df_test
    if column not in df_test.columns:
        print(f"{column} is not present in df_test.")
        return
    
    # Create a figure with two subplots
    fig, axes = plt.subplots(2, 1, figsize=(8, 10))
    
    # Plot histogram for df_train
    sns.histplot(data=df_train, x=column, kde=True, color='steelblue', ax=axes[0])
    axes[0].set_title(f"Train Data - Distribution of {column}")
    axes[0].set_xlabel(column)
    axes[0].set_ylabel("Frequency")
    axes[0].grid(True)
    
    # Plot histogram for df_test
    sns.histplot(data=df_test, x=column, kde=True, color='lightseagreen', ax=axes[1])
    axes[1].set_title(f"Test Data - Distribution of {column}")
    axes[1].set_xlabel(column)
    axes[1].set_ylabel("Frequency")
    axes[1].grid(True)
    
    # Adjust spacing between subplots
    fig.tight_layout()
    
    # Show the plots
    plt.show()


# Create an interactive function to plot the histogram
@interact
def plot_histogram(column=histogram_columns):
    # Remove missing values
    data = df_train[column].dropna()

    # Calculate the frequency of each bin
    hist_data, bin_edges = np.histogram(data, bins=30)

    # Calculate the color scale based on the frequency
    colorscale = ['#f0f0f0', '#0074D9']

    # Create a bar trace for the histogram
    trace = go.Bar(
        x=data,
        marker=dict(
            color=data,
            colorscale=colorscale,
            line=dict(color='black', width=0.5)
        ),
        opacity=0.8
    )

    # Create a layout for the histogram
    layout = go.Layout(
        title=f"Distribution of {column}",
        xaxis=dict(
            title=column,
            gridcolor='#eee'
        ),
        yaxis=dict(
            title='Frequency',
            gridcolor='#eee'
        ),
        plot_bgcolor='rgba(0, 0, 0, 0)',
        paper_bgcolor='rgba(0, 0, 0, 0)'
    )

    # Create a figure and add the trace
    fig = go.Figure(data=[trace], layout=layout)

    # Show the histogram
    fig.show()


import seaborn as sns

# Select the numerical columns
numerical_columns = df_train.select_dtypes(include=np.number).columns.tolist()


# Convert the columns to numeric
df_train[numerical_columns] = df_train[numerical_columns].apply(pd.to_numeric, errors='coerce')

# Create a scatter matrix plot
sns.set(style='ticks')
sns.pairplot(df_train[numerical_columns].dropna(), height=2.5)
plt.suptitle("Scatter Matrix Plot", y=1.02)
plt.show()


import seaborn as sns

# Select the numerical columns
numerical_columns = ['LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
                     '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
                     'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'SalePrice']

# Convert the columns to numeric
df_train[numerical_columns] = df_train[numerical_columns].apply(pd.to_numeric, errors='coerce')

# Create a scatter matrix plot
sns.set(style='ticks')
sns.pairplot(df_train[numerical_columns].dropna(), height=2.5)
plt.suptitle("Scatter Matrix Plot", y=1.02)
plt.show()


# Select the categorical columns
categorical_columns = df_train.select_dtypes(include='object').columns.tolist()

# Loop through each categorical column
for column in categorical_columns:
    unique_values = df_train[column].unique()
    value_counts = df_train[column].value_counts()
    
    print(f"Column: {column}")
    print(f"Unique Values: {unique_values}")
    print(f"Value Counts:\n{value_counts}")
    print("-" * 50)

Column: MSZoning
Unique Values: ['RL' 'RM' 'C (all)' 'FV' 'RH']
Value Counts:
RL         1151
RM          218
FV           65
RH           16
C (all)      10
Name: MSZoning, dtype: int64
--------------------------------------------------
Column: Street
Unique Values: ['Pave' 'Grvl']
Value Counts:
Pave    1454
Grvl       6
Name: Street, dtype: int64
--------------------------------------------------
Column: Alley
Unique Values: [nan 'Grvl' 'Pave']
Value Counts:
Grvl    50
Pave    41
Name: Alley, dtype: int64
--------------------------------------------------
Column: LotShape
Unique Values: ['Reg' 'IR1' 'IR2' 'IR3']
Value Counts:
Reg    925
IR1    484
IR2     41
IR3     10
Name: LotShape, dtype: int64
--------------------------------------------------
Column: LandContour
Unique Values: ['Lvl' 'Bnk' 'Low' 'HLS']
Value Counts:
Lvl    1311
Bnk      63
HLS      50
Low      36
Name: LandContour, dtype: int64
--------------------------------------------------
Column: Utilities
Unique Values: ['AllPub' 'NoSeWa']
Value Counts:
AllPub    1459
NoSeWa       1
Name: Utilities, dtype: int64
--------------------------------------------------
Column: LotConfig
Unique Values: ['Inside' 'FR2' 'Corner' 'CulDSac' 'FR3']
Value Counts:
Inside     1052
Corner      263
CulDSac      94
FR2          47
FR3           4
Name: LotConfig, dtype: int64
--------------------------------------------------
Column: LandSlope
Unique Values: ['Gtl' 'Mod' 'Sev']
Value Counts:
Gtl    1382
Mod      65
Sev      13
Name: LandSlope, dtype: int64
--------------------------------------------------
Column: Neighborhood
Unique Values: ['CollgCr' 'Veenker' 'Crawfor' 'NoRidge' 'Mitchel' 'Somerst' 'NWAmes'
 'OldTown' 'BrkSide' 'Sawyer' 'NridgHt' 'NAmes' 'SawyerW' 'IDOTRR'
 'MeadowV' 'Edwards' 'Timber' 'Gilbert' 'StoneBr' 'ClearCr' 'NPkVill'
 'Blmngtn' 'BrDale' 'SWISU' 'Blueste']
Value Counts:
NAmes      225
CollgCr    150
OldTown    113
Edwards    100
Somerst     86
Gilbert     79
NridgHt     77
Sawyer      74
NWAmes      73
SawyerW     59
BrkSide     58
Crawfor     51
Mitchel     49
NoRidge     41
Timber      38
IDOTRR      37
ClearCr     28
StoneBr     25
SWISU       25
MeadowV     17
Blmngtn     17
BrDale      16
Veenker     11
NPkVill      9
Blueste      2
Name: Neighborhood, dtype: int64
--------------------------------------------------
Column: Condition1
Unique Values: ['Norm' 'Feedr' 'PosN' 'Artery' 'RRAe' 'RRNn' 'RRAn' 'PosA' 'RRNe']
Value Counts:
Norm      1260
Feedr       81
Artery      48
RRAn        26
PosN        19
RRAe        11
PosA         8
RRNn         5
RRNe         2
Name: Condition1, dtype: int64
--------------------------------------------------
Column: Condition2
Unique Values: ['Norm' 'Artery' 'RRNn' 'Feedr' 'PosN' 'PosA' 'RRAn' 'RRAe']
Value Counts:
Norm      1445
Feedr        6
Artery       2
RRNn         2
PosN         2
PosA         1
RRAn         1
RRAe         1
Name: Condition2, dtype: int64
--------------------------------------------------
Column: BldgType
Unique Values: ['1Fam' '2fmCon' 'Duplex' 'TwnhsE' 'Twnhs']
Value Counts:
1Fam      1220
TwnhsE     114
Duplex      52
Twnhs       43
2fmCon      31
Name: BldgType, dtype: int64
--------------------------------------------------
Column: HouseStyle
Unique Values: ['2Story' '1Story' '1.5Fin' '1.5Unf' 'SFoyer' 'SLvl' '2.5Unf' '2.5Fin']
Value Counts:
1Story    726
2Story    445
1.5Fin    154
SLvl       65
SFoyer     37
1.5Unf     14
2.5Unf     11
2.5Fin      8
Name: HouseStyle, dtype: int64
--------------------------------------------------
Column: RoofStyle
Unique Values: ['Gable' 'Hip' 'Gambrel' 'Mansard' 'Flat' 'Shed']
Value Counts:
Gable      1141
Hip         286
Flat         13
Gambrel      11
Mansard       7
Shed          2
Name: RoofStyle, dtype: int64
--------------------------------------------------
Column: RoofMatl
Unique Values: ['CompShg' 'WdShngl' 'Metal' 'WdShake' 'Membran' 'Tar&Grv' 'Roll'
 'ClyTile']
Value Counts:
CompShg    1434
Tar&Grv      11
WdShngl       6
WdShake       5
Metal         1
Membran       1
Roll          1
ClyTile       1
Name: RoofMatl, dtype: int64
--------------------------------------------------
Column: Exterior1st
Unique Values: ['VinylSd' 'MetalSd' 'Wd Sdng' 'HdBoard' 'BrkFace' 'WdShing' 'CemntBd'
 'Plywood' 'AsbShng' 'Stucco' 'BrkComm' 'AsphShn' 'Stone' 'ImStucc'
 'CBlock']
Value Counts:
VinylSd    515
HdBoard    222
MetalSd    220
Wd Sdng    206
Plywood    108
CemntBd     61
BrkFace     50
WdShing     26
Stucco      25
AsbShng     20
BrkComm      2
Stone        2
AsphShn      1
ImStucc      1
CBlock       1
Name: Exterior1st, dtype: int64
--------------------------------------------------
Column: Exterior2nd
Unique Values: ['VinylSd' 'MetalSd' 'Wd Shng' 'HdBoard' 'Plywood' 'Wd Sdng' 'CmentBd'
 'BrkFace' 'Stucco' 'AsbShng' 'Brk Cmn' 'ImStucc' 'AsphShn' 'Stone'
 'Other' 'CBlock']
Value Counts:
VinylSd    504
MetalSd    214
HdBoard    207
Wd Sdng    197
Plywood    142
CmentBd     60
Wd Shng     38
Stucco      26
BrkFace     25
AsbShng     20
ImStucc     10
Brk Cmn      7
Stone        5
AsphShn      3
Other        1
CBlock       1
Name: Exterior2nd, dtype: int64
--------------------------------------------------
Column: MasVnrType
Unique Values: ['BrkFace' 'None' 'Stone' 'BrkCmn' nan]
Value Counts:
None       864
BrkFace    445
Stone      128
BrkCmn      15
Name: MasVnrType, dtype: int64
--------------------------------------------------
Column: ExterQual
Unique Values: ['Gd' 'TA' 'Ex' 'Fa']
Value Counts:
TA    906
Gd    488
Ex     52
Fa     14
Name: ExterQual, dtype: int64
--------------------------------------------------
Column: ExterCond
Unique Values: ['TA' 'Gd' 'Fa' 'Po' 'Ex']
Value Counts:
TA    1282
Gd     146
Fa      28
Ex       3
Po       1
Name: ExterCond, dtype: int64
--------------------------------------------------
Column: Foundation
Unique Values: ['PConc' 'CBlock' 'BrkTil' 'Wood' 'Slab' 'Stone']
Value Counts:
PConc     647
CBlock    634
BrkTil    146
Slab       24
Stone       6
Wood        3
Name: Foundation, dtype: int64
--------------------------------------------------
Column: BsmtQual
Unique Values: ['Gd' 'TA' 'Ex' nan 'Fa']
Value Counts:
TA    649
Gd    618
Ex    121
Fa     35
Name: BsmtQual, dtype: int64
--------------------------------------------------
Column: BsmtCond
Unique Values: ['TA' 'Gd' nan 'Fa' 'Po']
Value Counts:
TA    1311
Gd      65
Fa      45
Po       2
Name: BsmtCond, dtype: int64
--------------------------------------------------
Column: BsmtExposure
Unique Values: ['No' 'Gd' 'Mn' 'Av' nan]
Value Counts:
No    953
Av    221
Gd    134
Mn    114
Name: BsmtExposure, dtype: int64
--------------------------------------------------
Column: BsmtFinType1
Unique Values: ['GLQ' 'ALQ' 'Unf' 'Rec' 'BLQ' nan 'LwQ']
Value Counts:
Unf    430
GLQ    418
ALQ    220
BLQ    148
Rec    133
LwQ     74
Name: BsmtFinType1, dtype: int64
--------------------------------------------------
Column: BsmtFinType2
Unique Values: ['Unf' 'BLQ' nan 'ALQ' 'Rec' 'LwQ' 'GLQ']
Value Counts:
Unf    1256
Rec      54
LwQ      46
BLQ      33
ALQ      19
GLQ      14
Name: BsmtFinType2, dtype: int64
--------------------------------------------------
Column: Heating
Unique Values: ['GasA' 'GasW' 'Grav' 'Wall' 'OthW' 'Floor']
Value Counts:
GasA     1428
GasW       18
Grav        7
Wall        4
OthW        2
Floor       1
Name: Heating, dtype: int64
--------------------------------------------------
Column: HeatingQC
Unique Values: ['Ex' 'Gd' 'TA' 'Fa' 'Po']
Value Counts:
Ex    741
TA    428
Gd    241
Fa     49
Po      1
Name: HeatingQC, dtype: int64
--------------------------------------------------
Column: CentralAir
Unique Values: ['Y' 'N']
Value Counts:
Y    1365
N      95
Name: CentralAir, dtype: int64
--------------------------------------------------
Column: Electrical
Unique Values: ['SBrkr' 'FuseF' 'FuseA' 'FuseP' 'Mix' nan]
Value Counts:
SBrkr    1334
FuseA      94
FuseF      27
FuseP       3
Mix         1
Name: Electrical, dtype: int64
--------------------------------------------------
Column: KitchenQual
Unique Values: ['Gd' 'TA' 'Ex' 'Fa']
Value Counts:
TA    735
Gd    586
Ex    100
Fa     39
Name: KitchenQual, dtype: int64
--------------------------------------------------
Column: Functional
Unique Values: ['Typ' 'Min1' 'Maj1' 'Min2' 'Mod' 'Maj2' 'Sev']
Value Counts:
Typ     1360
Min2      34
Min1      31
Mod       15
Maj1      14
Maj2       5
Sev        1
Name: Functional, dtype: int64
--------------------------------------------------
Column: FireplaceQu
Unique Values: [nan 'TA' 'Gd' 'Fa' 'Ex' 'Po']
Value Counts:
Gd    380
TA    313
Fa     33
Ex     24
Po     20
Name: FireplaceQu, dtype: int64
--------------------------------------------------
Column: GarageType
Unique Values: ['Attchd' 'Detchd' 'BuiltIn' 'CarPort' nan 'Basment' '2Types']
Value Counts:
Attchd     870
Detchd     387
BuiltIn     88
Basment     19
CarPort      9
2Types       6
Name: GarageType, dtype: int64
--------------------------------------------------
Column: GarageFinish
Unique Values: ['RFn' 'Unf' 'Fin' nan]
Value Counts:
Unf    605
RFn    422
Fin    352
Name: GarageFinish, dtype: int64
--------------------------------------------------
Column: GarageQual
Unique Values: ['TA' 'Fa' 'Gd' nan 'Ex' 'Po']
Value Counts:
TA    1311
Fa      48
Gd      14
Ex       3
Po       3
Name: GarageQual, dtype: int64
--------------------------------------------------
Column: GarageCond
Unique Values: ['TA' 'Fa' nan 'Gd' 'Po' 'Ex']
Value Counts:
TA    1326
Fa      35
Gd       9
Po       7
Ex       2
Name: GarageCond, dtype: int64
--------------------------------------------------
Column: PavedDrive
Unique Values: ['Y' 'N' 'P']
Value Counts:
Y    1340
N      90
P      30
Name: PavedDrive, dtype: int64
--------------------------------------------------
Column: PoolQC
Unique Values: [nan 'Ex' 'Fa' 'Gd']
Value Counts:
Gd    3
Ex    2
Fa    2
Name: PoolQC, dtype: int64
--------------------------------------------------
Column: Fence
Unique Values: [nan 'MnPrv' 'GdWo' 'GdPrv' 'MnWw']
Value Counts:
MnPrv    157
GdPrv     59
GdWo      54
MnWw      11
Name: Fence, dtype: int64
--------------------------------------------------
Column: MiscFeature
Unique Values: [nan 'Shed' 'Gar2' 'Othr' 'TenC']
Value Counts:
Shed    49
Gar2     2
Othr     2
TenC     1
Name: MiscFeature, dtype: int64
--------------------------------------------------
Column: SaleType
Unique Values: ['WD' 'New' 'COD' 'ConLD' 'ConLI' 'CWD' 'ConLw' 'Con' 'Oth']
Value Counts:
WD       1267
New       122
COD        43
ConLD       9
ConLI       5
ConLw       5
CWD         4
Oth         3
Con         2
Name: SaleType, dtype: int64
--------------------------------------------------
Column: SaleCondition
Unique Values: ['Normal' 'Abnorml' 'Partial' 'AdjLand' 'Alloca' 'Family']
Value Counts:
Normal     1198
Partial     125
Abnorml     101
Family       20
Alloca       12
AdjLand       4
Name: SaleCondition, dtype: int64
--------------------------------------------------


from ipywidgets import interact

# Select the categorical columns
categorical_columns = df_train.select_dtypes(include='object').columns.tolist()

@interact(column=categorical_columns)
def plot_categorical(column):
    # Get the value counts for df_train
    train_value_counts = df_train[column].value_counts()
    
    # Plot the value counts for df_train using a bar plot
    plt.figure(figsize=(10, 6))
    sns.barplot(x=train_value_counts.index, y=train_value_counts.values, palette='viridis')
    plt.title(f"Value Counts of {column} (Train Data)")
    plt.xlabel(column)
    plt.ylabel("Frequency")
    plt.xticks(rotation=45, ha='right')
    plt.show()
    
    # Print the unique values for df_train
    train_unique_values = df_train[column].unique()
    print(f"Train Data - Column: {column}")
    print(f"Unique Values: {train_unique_values}")
    print("-" * 50)
    
    # Check if the column exists in df_test
    if column in df_test.columns:
        # Get the value counts for df_test
        test_value_counts = df_test[column].value_counts()

        # Plot the value counts for df_test using a bar plot
        plt.figure(figsize=(10, 6))
        sns.barplot(x=test_value_counts.index, y=test_value_counts.values, palette='viridis')
        plt.title(f"Value Counts of {column} (Test Data)")
        plt.xlabel(column)
        plt.ylabel("Frequency")
        plt.xticks(rotation=45, ha='right')
        plt.show()

        # Print the unique values for df_test
        test_unique_values = df_test[column].unique()
        print(f"Test Data - Column: {column}")
        print(f"Unique Values: {test_unique_values}")
        print("-" * 50)
    else:
        print(f"Test Data - Column: {column}")
        print("This column does not exist in the test data.")
        print("-" * 50)


# Select the categorical columns
categorical_columns = df_train.select_dtypes(include='object').columns.tolist()

@interact(column=categorical_columns)
def plot_categorical(column):
    # Plot the count plot for df_train
    plt.figure(figsize=(12, 6))
    sns.countplot(data=df_train, x=column, palette='viridis')
    plt.title(f"Distribution of {column} (Train Data)")
    plt.xlabel(column)
    plt.ylabel("Count")
    plt.xticks(rotation=45, ha='right')
    plt.show()

    # Print the value counts for df_train
    train_value_counts = df_train[column].value_counts()
    print(f"Train Data - Column: {column}")
    print(f"Value Counts:\n{train_value_counts}")
    print("-" * 50)
    
    # Check if the column exists in df_test
    if column in df_test.columns:
        # Plot the count plot for df_test
        plt.figure(figsize=(12, 6))
        sns.countplot(data=df_test, x=column, palette='viridis')
        plt.title(f"Distribution of {column} (Test Data)")
        plt.xlabel(column)
        plt.ylabel("Count")
        plt.xticks(rotation=45, ha='right')
        plt.show()

        # Print the value counts for df_test
        test_value_counts = df_test[column].value_counts()
        print(f"Test Data - Column: {column}")
        print(f"Value Counts:\n{test_value_counts}")
        print("-" * 50)
    else:
        print(f"Test Data - Column: {column}")
        print("This column does not exist in the test data.")
        print("-" * 50)


import plotly.graph_objects as go

# Calculate the correlation matrix
correlation_matrix = df_train.corr()

# Get the column names
columns = correlation_matrix.columns

# Create a hovertext for the correlation matrix
hovertext = [["<b>Correlation:</b> {:.2f}".format(correlation_matrix.iloc[i, j]), "<b>Column 1:</b> {}<br><b>Column 2:</b> {}".format(columns[i], columns[j])]
             for i in range(len(columns))
             for j in range(len(columns))]

# Create a trace for the correlation matrix heatmap
trace = go.Heatmap(
    z=correlation_matrix.values,
    x=columns,
    y=columns,
    hovertemplate='<b>Correlation:</b> %{z:.2f}<br>',
    text=hovertext,
    colorscale='RdYlBu',  # You can customize the colorscale if desired
)

# Create a layout for the plot
layout = go.Layout(
    title="Correlation Matrix",
    xaxis=dict(
        title="Column",
        tickangle=-45,  # Rotate the tick labels for better readability
        tickfont=dict(size=12),  # Adjust the font size of the tick labels
    ),
    yaxis=dict(
        title="Column",
        tickangle=-45,  # Rotate the tick labels for better readability
        tickfont=dict(size=12),  # Adjust the font size of the tick labels
    ),
    width=1000,  # Set the width of the plot
    height=1000,  # Set the height of the plot
)

# Create a figure and add the trace
fig = go.Figure(data=[trace], layout=layout)

# Show the plot
fig.show()


from IPython.display import IFrame
corr_matrix = IFrame(src = "https://plotly.com/~zeed/1/", width =1400 , height = 1000)
display(corr_matrix)


# Define the columns suitable for outlier investigation
outlier_columns = df_train.select_dtypes(include=np.number).columns.tolist()

@interact(column=outlier_columns)
def plot_outliers(column):
    if column == 'SalePrice':
        # Plot boxplot and scatter plot for df_train
        plt.figure(figsize=(8, 6))
        sns.boxplot(data=df_train, y=column, color='steelblue')
        plt.title(f"Boxplot: {column} (Train Data)")
        plt.ylabel(column)
        plt.show()

        plt.figure(figsize=(8, 6))
        sns.scatterplot(data=df_train, x=column, y='SalePrice', color='steelblue')
        plt.title(f"Scatter Plot: {column} vs SalePrice (Train Data)")
        plt.xlabel(column)
        plt.ylabel("SalePrice")
        plt.grid(True)
        plt.show()

        # Print message for df_test
        print("Test Data - Column: SalePrice")
        print("This column does not exist in the test data.")
        print("-" * 50)
    else:
        # Plot boxplot for df_train
        plt.figure(figsize=(8, 6))
        sns.boxplot(data=df_train, y=column, color='steelblue')
        plt.title(f"Boxplot: {column} (Train Data)")
        plt.ylabel(column)
        plt.show()

        # Plot boxplot for df_test if column exists
        if column in df_test.columns and "SalePrice" in df_test.columns:
            plt.figure(figsize=(8, 6))
            sns.boxplot(data=df_test, y=column, color='steelblue')
            plt.title(f"Boxplot: {column} (Test Data)")
            plt.ylabel(column)
            plt.show()
        else:
            print(f"Test Data - Column: {column}")
            print("This column does not exist in the test data.")
            print("-" * 50)

        # Plot scatter plot for df_train
        plt.figure(figsize=(8, 6))
        sns.scatterplot(data=df_train, x=column, y='SalePrice', color='steelblue')
        plt.title(f"Scatter Plot: {column} vs SalePrice (Train Data)")
        plt.xlabel(column)
        plt.ylabel("SalePrice")
        plt.grid(True)
        plt.show()

        # Plot scatter plot for df_test if column exists
        if column in df_test.columns and "SalePrice" in df_test.columns:
            plt.figure(figsize=(8, 6))
            sns.scatterplot(data=df_test, x=column, y='SalePrice', color='steelblue')
            plt.title(f"Scatter Plot: {column} vs SalePrice (Test Data)")
            plt.xlabel(column)
            plt.ylabel("SalePrice")
            plt.grid(True)
            plt.show()
        else:
            print(f"Test Data - Column: {column}")
            print("This column does not exist in the test data.")
            print("-" * 50)


# Define the columns suitable for outlier investigation
outlier_columns = df_train.select_dtypes(include=np.number).columns.tolist()
outlier_columns.remove("SalePrice")
outlier_columns.remove("Id")
@interact(column1=outlier_columns, column2=outlier_columns)
def plot_outliers(column1, column2):
    # Plot boxplot and scatter plot for df_train
    plt.figure(figsize=(16, 6))
    
    # Boxplot for column1 in df_train
    plt.subplot(1, 2, 1)
    sns.boxplot(data=df_train, y=column1, color='steelblue')
    plt.title(f"Boxplot: {column1} (Train Data)")
    plt.ylabel(column1)
    
    # Scatter plot for column2 vs column1 in df_train
    plt.subplot(1, 2, 2)
    sns.scatterplot(data=df_train, x=column2, y=column1, color='steelblue')
    plt.title(f"Scatter Plot: {column2} vs {column1} (Train Data)")
    plt.xlabel(column2)
    plt.ylabel("SalePrice")
    plt.grid(True)
    
    plt.tight_layout()
    plt.show()

    # Plot boxplot and scatter plot for df_test if column exists
    if column1 in df_test.columns and column2 in df_test.columns:
        plt.figure(figsize=(16, 6))
        
        # Boxplot for column1 in df_test
        plt.subplot(1, 2, 1)
        sns.boxplot(data=df_test, y=column1, color='steelblue')
        plt.title(f"Boxplot: {column1} (Test Data)")
        plt.ylabel(column1)
        
        # Scatter plot for column1 vs column2 in df_train
        plt.subplot(1, 2, 2)
        sns.scatterplot(data=df_train, x=column1, y=column2, color='steelblue')
        plt.title(f"Scatter Plot: {column1} vs {column2} (Train Data)")
        plt.xlabel(column2)
        plt.ylabel("SalePrice")
        plt.grid(True)
        plt.tight_layout()
        
        plt.show()
    else:
        print("One or both of the selected columns do not exist in the test data.")


import pandas as pd
import matplotlib.pyplot as plt

# Calculate the percentage of missing values in each column
missing_percentage = df_train.isnull().mean() * 100

# Sort the columns based on the missing percentage in descending order
sorted_missing = missing_percentage.sort_values(ascending=False)

# Plot the distribution of missing values
plt.figure(figsize=(24, 6))
sorted_missing.plot(kind='bar', color='steelblue')
plt.xlabel('Features')
plt.ylabel('Missing Percentage')
plt.title('Distribution of Missing Values')
plt.xticks(rotation=45, ha='right')
plt.show()


import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

# Group the data by a categorical feature (e.g., neighborhood)
grouped_data = df_train.groupby('Neighborhood')

# Select a numerical feature of interest
numerical_feature = 'YearBuilt'

# Create a font dictionary for bold text
font = {'fontweight': 'bold'}

# Plot the box plot for the selected feature grouped by the categorical feature
plt.figure(figsize=(24, 12))
sns.boxplot(x='Neighborhood', y=numerical_feature, data=df_train)
plt.xlabel('Neighborhood', **font)
plt.ylabel("Year Built", **font)
plt.title('Distribution of {} by Neighborhood'.format(numerical_feature), **font)
plt.xticks(rotation=45)
plt.show()


df_train.dtypes

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
                  ...   
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
SalePrice          int64
Length: 81, dtype: object


# Define the threshold for the percentage of missing values
missing_threshold = 0.5

# Calculate the percentage of missing values in each column
missing_percentage = df_train.isnull().mean()

# Identify columns with missing values exceeding the threshold and no significant impact on SalePrice
columns_to_drop = []
for column in df_train.columns:
    if missing_percentage[column] > missing_threshold:
        if df_train[column].dtype == 'object':
            # Use label encoding or one-hot encoding for categorical columns
            encoded_column = pd.factorize(df_train[column])[0]  # Label encoding
            # or
            encoded_column = pd.get_dummies(df_train[column], prefix=column)  # One-hot encoding
            
            # Calculate the correlation between the encoded column and SalePrice
            correlation = encoded_column.corrwith(df_train['SalePrice']).values[0]
            
            # Check if the absolute correlation value is less than 0.5
            if abs(correlation) < 0.4:
                columns_to_drop.append(column)
        else:
            # Calculate the correlation between the column and SalePrice
            correlation = df_train[column].corr(df_train['SalePrice'])
            
            # Check if the absolute correlation value is less than 0.5
            if abs(correlation) < 0.5:
                columns_to_drop.append(column)

# Drop the identified columns from the dataset
#df_train.drop(columns_to_drop, axis=1, inplace=True)

# Print the updated dataframe after dropping the columns
df_train.head()


# Calculate the percentage of missing values in each column
missing_percentage = df_train.isnull().mean()

# Identify columns with no significant impact on SalePrice
#columns_to_drop = []
for column in df_train.columns:
    
        if df_train[column].dtype == 'object':
            # Use label encoding or one-hot encoding for categorical columns
            encoded_column = pd.factorize(df_train[column])[0]  # Label encoding
            # or
            encoded_column = pd.get_dummies(df_train[column], prefix=column)  # One-hot encoding
            
            # Calculate the correlation between the encoded column and SalePrice
            correlation = encoded_column.corrwith(df_train['SalePrice']).values[0]
            
            # Check if the absolute correlation value is less than 0.5
            if abs(correlation) < 0.5:
                columns_to_drop.append(column)
        else:
            # Calculate the correlation between the column and SalePrice
            correlation = df_train[column].corr(df_train['SalePrice'])
            
            # Check if the absolute correlation value is less than 0.5
            if abs(correlation) < 0.4:
                columns_to_drop.append(column)

# Drop the identified columns from the dataset
#df_train.drop(columns_to_drop, axis=1, inplace=True)

# Print the updated dataframe after dropping the columns
print(columns_to_drop)
df_train.head()

['Alley', 'PoolQC', 'Fence', 'MiscFeature', 'Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallCond', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', '2ndFlrSF', 'LowQualFinSF', 'BsmtFullBath', 'BsmtHalfBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC', 'Fence', 'MiscFeature', 'MiscVal', 'MoSold', 'YrSold', 'SaleType', 'SaleCondition']


len(df_train.columns)  - len(columns_to_drop) 
df_train.columns.tolist()

['Id',
 'MSSubClass',
 'MSZoning',
 'LotFrontage',
 'LotArea',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'MasVnrArea',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinSF1',
 'BsmtFinType2',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'KitchenQual',
 'TotRmsAbvGrd',
 'Functional',
 'Fireplaces',
 'FireplaceQu',
 'GarageType',
 'GarageYrBlt',
 'GarageFinish',
 'GarageCars',
 'GarageArea',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'MiscVal',
 'MoSold',
 'YrSold',
 'SaleType',
 'SaleCondition',
 'SalePrice']


columns_to_keep = [x for x in df_train.columns if x not in columns_to_drop]
columns_to_keep
len(columns_to_keep)

16


import seaborn as sns

def plot_dropped_columns():
    numeric_columns_to_drop = [column for column in columns_to_drop if df_train[column].dtype != 'object']
    correlations = []
    for column in numeric_columns_to_drop:
        correlation = df_train[column].corr(df_train['SalePrice'])
        correlations.append(correlation)

    plt.figure(figsize=(20, 12))
    plt.bar(numeric_columns_to_drop, correlations)
    plt.xlabel('Columns')
    plt.ylabel('Correlation with SalePrice')
    plt.title('Correlation of Dropped Columns with SalePrice')
    plt.xticks(rotation=45)
    plt.show()

    if len(columns_to_drop) > 0:
        plt.figure(figsize=(20, 12))
        missing_data = df_train[columns_to_drop].isnull().sum()
        missing_data.plot(kind='bar')
        plt.xlabel('Columns')
        plt.ylabel('Number of Missing Values')
        plt.title('Missing Values in Dropped Columns')
        plt.xticks(rotation=90)
        plt.show()
    else:
        print("No columns to drop with missing values.")

def plot_kept_columns():
    plt.figure(figsize=(10, len(columns_to_keep)*0.5))
    kept_columns_corr = df_train[columns_to_keep].corrwith(df_train['SalePrice']).sort_values()
    kept_columns_corr.plot(kind='barh')
    plt.xlabel('Correlation with SalePrice')
    plt.ylabel('Columns')
    plt.title('Correlation of Kept Columns with SalePrice')
    plt.show()

    plt.figure(figsize=(10, len(columns_to_keep)*0.5))
    sns.heatmap(df_train[columns_to_keep].isnull(), cmap='viridis')
    plt.title('Missing Values in Kept Columns')
    plt.xticks(rotation=45)
    plt.show()

@interact(option=['Dropped Columns', 'Kept Columns'])
def plot_columns(option):
    if option == 'Dropped Columns':
        plot_dropped_columns()
    elif option == 'Kept Columns':
        plot_kept_columns()


def plot_dropped_columns():
    numeric_columns_to_drop = [column for column in columns_to_drop if df_train[column].dtype != 'object']
    correlations = []
    for column in numeric_columns_to_drop:
        correlation = df_train[column].corr(df_train['SalePrice'])
        correlations.append(correlation)

    plt.figure(figsize=(10, 6))
    plt.bar(numeric_columns_to_drop, correlations)
    plt.xlabel('Columns')
    plt.ylabel('Correlation with SalePrice')
    plt.title('Correlation of Dropped Columns with SalePrice')
    plt.xticks(rotation=90)
    plt.show()

    if len(columns_to_drop) > 0:
        plt.figure(figsize=(10, 6))
        missing_data = df_train[columns_to_drop].isnull().sum()
        missing_data.plot(kind='bar')
        plt.xlabel('Columns')
        plt.ylabel('Number of Missing Values')
        plt.title('Missing Values in Dropped Columns')
        plt.xticks(rotation=90)
        plt.show()
    else:
        print("No columns to drop with missing values.")

def plot_kept_columns():
    plt.figure(figsize=(10, len(columns_to_keep)*0.5))
    kept_columns_corr = df_train[columns_to_keep].corrwith(df_train['SalePrice']).sort_values()
    kept_columns_corr.plot(kind='barh')
    plt.xlabel('Correlation with SalePrice')
    plt.ylabel('Columns')
    plt.title('Correlation of Kept Columns with SalePrice')
    plt.show()

    plt.figure(figsize=(10, len(columns_to_keep)*0.5))
    sns.heatmap(df_train[columns_to_keep].isnull(), cmap='viridis')
    plt.title('Missing Values in Kept Columns')
    plt.show()

@interact(option=['Dropped Columns', 'Kept Columns'])
def plot_columns(option):
    if option == 'Dropped Columns':
        plot_dropped_columns()
    elif option == 'Kept Columns':
        plot_kept_columns()


df_train.drop(columns_to_drop, axis=1, inplace=True)


df_train


import pandas as pd

# Define the correct data types for each column
correct_dtypes = {
    'Id': int,
    'MSSubClass': 'category',
    'MSZoning': 'category',
    'LotFrontage': float,
    'LotArea': int,
    'Street': 'category',
    'Alley': 'category',
    'LotShape': 'category',
    'LandContour': 'category',
    'Utilities': 'category',
    'LotConfig': 'category',
    'LandSlope': 'category',
    'Neighborhood': 'category',
    'Condition1': 'category',
    'Condition2': 'category',
    'BldgType': 'category',
    'HouseStyle': 'category',
    'OverallQual': pd.CategoricalDtype(categories=[1,2,3,4,5,6,7,8,9,10], ordered=True),
    'OverallCond': pd.CategoricalDtype(categories=[1,2,3,4,5,6,7,8,9,10], ordered=True),
    'YearBuilt': int,
    'YearRemodAdd': int,
    'RoofStyle': 'category',
    'RoofMatl': 'category',
    'Exterior1st': 'category',
    'Exterior2nd': 'category',
    'MasVnrType': 'category',
    'MasVnrArea': float,
    'ExterQual': 'category',
    'ExterCond': 'category',
    'Foundation': 'category',
    'BsmtQual': 'category',
    'BsmtCond': 'category',
    'BsmtExposure': 'category',
    'BsmtFinType1': 'category',
    'BsmtFinSF1': int,
    'BsmtFinType2': 'category',
    'BsmtFinSF2': int,
    'BsmtUnfSF': int,
    'TotalBsmtSF': int,
    'Heating': 'category',
    'HeatingQC': 'category',
    'CentralAir': 'category',
    'Electrical': 'category',
    '1stFlrSF': int,
    '2ndFlrSF': int,
    'LowQualFinSF': int,
    'GrLivArea': int,
    'BsmtFullBath': int,
    'BsmtHalfBath': int,
    'FullBath': int,
    'HalfBath': int,
    'BedroomAbvGr': int,
    'KitchenAbvGr': int,
    'KitchenQual': 'category',
    'TotRmsAbvGrd': int,
    'Functional': 'category',
    'Fireplaces': int,
    'FireplaceQu': 'category',
    'GarageType': 'category',
    'GarageYrBlt': object,
    'GarageFinish': 'category',
    'GarageCars': int,
    'GarageArea': int,
    'GarageQual': 'category',
    'GarageCond': 'category',
    'PavedDrive': 'category',
    'WoodDeckSF': int,
    'OpenPorchSF': int,
    'EnclosedPorch': int,
    '3SsnPorch': int,
    'ScreenPorch': int,
    'PoolArea': int,
    'PoolQC': 'category',
    'Fence': 'category',
    'MiscFeature': 'category',
    'MiscVal': int,
    'MoSold': int,
    'YrSold': int,
    'SaleType': 'category',
    'SaleCondition': 'category'
}

# Convert columns to their correct data types in df_train and df_test
for column, dtype in correct_dtypes.items():
    if column in df_train.columns:
        if dtype == pd.to_datetime:
            df_train[column] = pd.to_datetime(df_train[column],format='%Y')
            #df_test[column] = pd.to_datetime(df_test[column])
        else:
            df_train[column] = df_train[column].astype(dtype)
            #df_test[column] = df_test[column].astype(dtype)

# Print the updated dataframe with data types
print(df_train.dtypes)
#print(df_test.dtypes)

# save it to a file
(df_train.dtypes).to_csv("dtypes_train_clean.csv")
#(df_test.dtypes).to_csv("dtypes_test_clean.csv")

Id                  int32
MSSubClass       category
MSZoning         category
LotFrontage       float64
LotArea             int32
                   ...   
MoSold              int32
YrSold              int32
SaleType         category
SaleCondition    category
SalePrice           int64
Length: 81, dtype: object


df_train[date_columns]


categorical_columns = ['OverallQual', 'BsmtQual', 'KitchenQual']
numerical_columns = ['MasVnrArea', 'TotalBsmtSF', '1stFlrSF', 'GrLivArea', 'FullBath', 'TotRmsAbvGrd',
                     'Fireplaces', 'GarageCars', 'GarageArea', 'SalePrice']
date_columns = ['YearBuilt', 'YearRemodAdd','GarageYrBlt']


# Check for missing values in each column
missing_values = df_train.isnull().sum()
print("Missing Values:")
print(missing_values)
print()

Missing Values:
Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64


# Set categories for BsmtQual
df_train['BsmtQual'] = df_train['BsmtQual'].cat.add_categories('No Basement')

# Fill missing values in BsmtQual with "No Basement"
df_train['BsmtQual'] = df_train['BsmtQual'].fillna('No Basement')


df_train['BsmtQual'].value_counts()

TA             649
Gd             618
Ex             121
No Basement     37
Fa              35
Name: BsmtQual, dtype: int64


df_train_reloaded = pd.read_csv(r'C:\Users\User\Desktop\GitHub-projects\projects\Data-Dives-Projects-Unleashed\dataSets\course 1\week 1\datasets\House_Price_Prediction\train _price_house.csv')


# Filter the rows where GarageYrBlt is missing
missing_garage = df_train_reloaded[df_train['GarageYrBlt'].isnull()]

# Count the unique values in GarageQual for the missing GarageYrBlt rows
garage_qual_table = df_train_reloaded[df_train_reloaded['GarageYrBlt'].isnull()][['GarageYrBlt','GarageQual']]

garage_qual_table


df_train


df_train_garage_impute = df_train.copy()
df_train_garage_impute


# Impute missing values in GarageYrBlt based on YearBuilt
df_train_garage_impute.loc[df_train_garage_impute['GarageYrBlt'].isna(), 'GarageYrBlt'] =df_train_garage_impute['YearBuilt']

# Verify the imputation
missing_garage_yrblt = df_train_garage_impute['GarageYrBlt'].isna().sum()
print(f"Missing values in GarageYrBlt after imputation: {missing_garage_yrblt}")

Missing values in GarageYrBlt after imputation: 0


df_train_garage_impute


df_train_garage_unknown = df_train.copy()
df_train_garage_unknown


# Replace missing values in GarageYrBlt with "Unknown"
df_train_garage_unknown.loc[df_train_garage_unknown['GarageYrBlt'].isna(), 'GarageYrBlt'] = df_train_garage_unknown['YearBuilt']

# Verify the replacement
missing_garage_yrblt = df_train_garage_unknown['GarageYrBlt'].isna().sum()
print(f"Missing values in GarageYrBlt after replacement: {missing_garage_yrblt}")

Missing values in GarageYrBlt after replacement: 0


df_train_garage_unknown


# Drop the rows with missing values in MasVnrArea column
df_train = df_train.dropna(subset=['MasVnrArea'])
df_train_garage_impute = df_train_garage_impute.dropna(subset=['MasVnrArea'])
df_train_garage_unknown = df_train_garage_unknown.dropna(subset=['MasVnrArea'])


# Check for inconsistencies or errors in categorical columns
for column in categorical_columns:
    unique_values = df_train[column].unique()
    print(f"Column: {column}")
    print(f"   Unique values: {unique_values}")
    print()

Column: OverallQual
   Unique values: [7, 6, 8, 5, 9, 4, 10, 3, 1, 2]
Categories (10, int64): [1 < 2 < 3 < 4 ... 7 < 8 < 9 < 10]

Column: BsmtQual
   Unique values: ['Gd', 'TA', 'Ex', 'No Basement', 'Fa']
Categories (5, object): ['Ex', 'Fa', 'Gd', 'TA', 'No Basement']

Column: KitchenQual
   Unique values: ['Gd', 'TA', 'Ex', 'Fa']
Categories (4, object): ['Ex', 'Fa', 'Gd', 'TA']


# Identify potential outliers in numerical columns
for column in numerical_columns:
    plt.figure(figsize=(10, 6))
    sns.boxplot(data=df_train[column])
    plt.title(f"Boxplot of {column}")
    plt.show()


from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import RobustScaler

# Identify and remove outliers using Isolation Forest
outlier_detector = IsolationForest(contamination='auto')
outliers = outlier_detector.fit_predict(df_train_garage_impute[numerical_columns])
df_train_outliers_1 = df_train_garage_impute[outliers == 1]  # Keep only non-outlier instances

# Scale the numerical columns using RobustScaler
scaler = RobustScaler()
df_train_outliers_1[numerical_columns] = scaler.fit_transform(df_train_outliers_1[numerical_columns])


df_train_outliers_1


from scipy.stats import mstats

def handle_outliers(data, column):
    # Calculate the lower and upper thresholds using Winsorization
    data[column] = mstats.winsorize(data[column], limits=[0.05, 0.05])


df_train_outliers_2 = df_train_garage_impute.copy()  # Create a copy of the original data

for column in numerical_columns:
    handle_outliers(df_train_outliers_2, column)


df_train_outliers_2


# Check for inconsistencies or errors in date columns
for column in date_columns:
    unique_dates = sorted((df_train[column].unique()).tolist(), reverse=True)
    print(f"Column: {column}")
    print(f"Unique dates: {unique_dates}")
    print()

Column: YearBuilt
Unique dates: [2010, 2009, 2008, 2007, 2006, 2005, 2004, 2003, 2002, 2001, 2000, 1999, 1998, 1997, 1996, 1995, 1994, 1993, 1992, 1991, 1990, 1989, 1988, 1987, 1986, 1985, 1984, 1983, 1982, 1981, 1980, 1979, 1978, 1977, 1976, 1975, 1974, 1973, 1972, 1971, 1970, 1969, 1968, 1967, 1966, 1965, 1964, 1963, 1962, 1961, 1960, 1959, 1958, 1957, 1956, 1955, 1954, 1953, 1952, 1951, 1950, 1949, 1948, 1947, 1946, 1945, 1942, 1941, 1940, 1939, 1938, 1937, 1936, 1935, 1934, 1932, 1931, 1930, 1929, 1928, 1927, 1926, 1925, 1924, 1923, 1922, 1921, 1920, 1919, 1918, 1917, 1916, 1915, 1914, 1913, 1912, 1911, 1910, 1908, 1906, 1905, 1904, 1900, 1898, 1893, 1892, 1890, 1885, 1882, 1880, 1875, 1872]

Column: YearRemodAdd
Unique dates: [2010, 2009, 2008, 2007, 2006, 2005, 2004, 2003, 2002, 2001, 2000, 1999, 1998, 1997, 1996, 1995, 1994, 1993, 1992, 1991, 1990, 1989, 1988, 1987, 1986, 1985, 1984, 1983, 1982, 1981, 1980, 1979, 1978, 1977, 1976, 1975, 1974, 1973, 1972, 1971, 1970, 1969, 1968, 1967, 1966, 1965, 1964, 1963, 1962, 1961, 1960, 1959, 1958, 1957, 1956, 1955, 1954, 1953, 1952, 1951, 1950]

Column: GarageYrBlt
Unique dates: [2010.0, 2008.0, 2007.0, 2006.0, 2005.0, 2004.0, 2003.0, 2002.0, 2001.0, 2000.0, 1998.0, 1991.0, nan, 2009.0, 1999.0, 1997.0, 1996.0, 1995.0, 1994.0, 1993.0, 1992.0, 1990.0, 1989.0, 1988.0, 1987.0, 1986.0, 1985.0, 1984.0, 1983.0, 1982.0, 1981.0, 1980.0, 1979.0, 1978.0, 1977.0, 1976.0, 1975.0, 1974.0, 1973.0, 1972.0, 1971.0, 1970.0, 1969.0, 1968.0, 1967.0, 1966.0, 1965.0, 1964.0, 1963.0, 1962.0, 1961.0, 1960.0, 1959.0, 1958.0, 1957.0, 1956.0, 1955.0, 1954.0, 1953.0, 1952.0, 1951.0, 1950.0, 1949.0, 1948.0, 1947.0, 1946.0, 1945.0, 1942.0, 1941.0, 1940.0, 1939.0, 1938.0, 1937.0, 1936.0, 1935.0, 1934.0, 1933.0, 1932.0, 1931.0, 1930.0, 1929.0, 1928.0, 1927.0, 1926.0, 1925.0, 1924.0, 1923.0, 1922.0, 1921.0, 1920.0, 1918.0, 1916.0, 1915.0, 1914.0, 1910.0, 1908.0, 1906.0, 1900.0]


# Check for missing values in each date column
missing_dates = df_train_garage_unknown[date_columns].isnull().sum()
print("Missing Dates:")
print(missing_dates)
print()

Missing Dates:
YearBuilt       0
YearRemodAdd    0
GarageYrBlt     0
dtype: int64


# Check for valid dates in each date column
for column in date_columns:
    invalid_dates = df_train[column].apply(lambda x: isinstance(x, datetime)).sum()
    print(f"Column: {column}")
    print(f"Invalid dates: {invalid_dates}")
    print()


df_train[date_columns].dtypes

YearBuilt        int32
YearRemodAdd     int32
GarageYrBlt     object
dtype: object


df_train_garage_impute['GarageYrBlt'] = df_train_garage_impute['GarageYrBlt'].astype(int)
df_train_garage_unknown['GarageYrBlt'] = df_train_garage_unknown['GarageYrBlt'].astype(int)
df_train_outliers_1['GarageYrBlt'] = df_train_outliers_1['GarageYrBlt'].astype(int)
df_train_outliers_2['GarageYrBlt'] = df_train_outliers_2['GarageYrBlt'].astype(int)


print(f"The categorical columns are: {categorical_columns}")

The categorical columns are: ['OverallQual', 'BsmtQual', 'KitchenQual']


def get_categorical_columns(dataframes):
    categorical_columns_list = []
    for data in dataframes:
        categorical_columns = data.select_dtypes(include=[object, 'category']).columns
        categorical_columns_list.append(categorical_columns)
    return categorical_columns_list

dataframes = [df_train, df_train_garage_impute, df_train_garage_unknown, df_train_outliers_1, df_train_outliers_2]

categorical_columns_list = get_categorical_columns(dataframes)

print(f"The categorical columns in all the data frames are:")
categorical_columns_list

The categorical columns in all the data frames are:

[Index(['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour',
        'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1',
        'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond',
        'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
        'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
        'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC',
        'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu',
        'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond',
        'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType',
        'SaleCondition'],
       dtype='object'),
 Index(['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour',
        'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1',
        'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond',
        'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
        'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
        'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC',
        'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu',
        'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond',
        'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType',
        'SaleCondition'],
       dtype='object'),
 Index(['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour',
        'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1',
        'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond',
        'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
        'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
        'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC',
        'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu',
        'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond',
        'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType',
        'SaleCondition'],
       dtype='object'),
 Index(['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour',
        'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1',
        'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond',
        'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
        'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
        'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC',
        'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu',
        'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond',
        'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType',
        'SaleCondition'],
       dtype='object'),
 Index(['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour',
        'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1',
        'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond',
        'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
        'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
        'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC',
        'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu',
        'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond',
        'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType',
        'SaleCondition'],
       dtype='object')]


df_train[categorical_columns[2]]

0       Gd
1       TA
2       Gd
3       Gd
4       Gd
        ..
1455    TA
1456    TA
1457    Gd
1458    Gd
1459    TA
Name: KitchenQual, Length: 1452, dtype: category
Categories (4, object): ['Ex', 'Fa', 'Gd', 'TA']


from sklearn.preprocessing import LabelEncoder

def label_encode_dataframes(dataframes, column_name):
    # Create an instance of LabelEncoder
    label_encoder = LabelEncoder()

    for df in dataframes:
        # Fit and transform the specified column with LabelEncoder
        df[column_name] = label_encoder.fit_transform(df[column_name])
    
    return dataframes

# Define the column to be label encoded
column_name = 'OverallQual'

# Apply label encoding to the original dataframes
encoded_dataframes = label_encode_dataframes(dataframes, column_name)


from sklearn.preprocessing import OrdinalEncoder

# Define the order of the categories
categories = [['No Basement', 'Po', 'Fa', 'TA', 'Gd', 'Ex']]

# Create an instance of the OrdinalEncoder
ordinal_encoder = OrdinalEncoder(categories=categories)

# Perform ordinal encoding on the 'BsmtQual' column
for df in encoded_dataframes:
    df['BsmtQual'] = ordinal_encoder.fit_transform(df[['BsmtQual']])


df_train["BsmtQual"]

0       4.0
1       4.0
2       4.0
3       3.0
4       4.0
       ... 
1455    4.0
1456    4.0
1457    3.0
1458    3.0
1459    3.0
Name: BsmtQual, Length: 1452, dtype: float64


df_train_garage_impute["BsmtQual"].value_counts()

3.0    648
4.0    612
5.0    120
0.0     37
2.0     35
Name: BsmtQual, dtype: int64


from sklearn.preprocessing import OrdinalEncoder

# Define the order of the categories for KitchenQual
kitchen_qual_categories = [['Po', 'Fa', 'TA', 'Gd', 'Ex']]

# Create an instance of the OrdinalEncoder for KitchenQual
kitchen_qual_encoder = OrdinalEncoder(categories=kitchen_qual_categories)

# Apply ordinal encoding to the 'KitchenQual' column in each data frame
for df in encoded_dataframes:
    df['KitchenQual'] = kitchen_qual_encoder.fit_transform(df[['KitchenQual']])


# Define the list of data frame names
dataframe_names = ['df_train', 'df_train_garage_impute', 'df_train_garage_unknown', 'df_train_outliers_1', 'df_train_outliers_2']

# Iterate through each data frame and evaluate the encoded features
for df, df_name in zip(encoded_dataframes, dataframe_names):
    print(f"Data Frame: {df_name}")
    print("------------------------"*3)
    # Print the unique values and their counts for each encoded feature
    for column in categorical_columns:
        print(f"Encoded Feature: {column}")
        print(df[column].value_counts())
        print()
    print("------------------------")

Data Frame: df_train
------------------------------------------------------------------------
Encoded Feature: OverallQual
4    397
5    372
6    315
7    167
3    116
8     43
2     20
9     17
1      3
0      2
Name: OverallQual, dtype: int64

Encoded Feature: BsmtQual
3.0    648
4.0    612
5.0    120
0.0     37
2.0     35
Name: BsmtQual, dtype: int64

Encoded Feature: KitchenQual
2.0    734
3.0    580
4.0     99
1.0     39
Name: KitchenQual, dtype: int64

------------------------
Data Frame: df_train_garage_impute
------------------------------------------------------------------------
Encoded Feature: OverallQual
4    397
5    372
6    315
7    167
3    116
8     43
2     20
9     17
1      3
0      2
Name: OverallQual, dtype: int64

Encoded Feature: BsmtQual
3.0    648
4.0    612
5.0    120
0.0     37
2.0     35
Name: BsmtQual, dtype: int64

Encoded Feature: KitchenQual
2.0    734
3.0    580
4.0     99
1.0     39
Name: KitchenQual, dtype: int64

------------------------
Data Frame: df_train_garage_unknown
------------------------------------------------------------------------
Encoded Feature: OverallQual
4    397
5    372
6    315
7    167
3    116
8     43
2     20
9     17
1      3
0      2
Name: OverallQual, dtype: int64

Encoded Feature: BsmtQual
3.0    648
4.0    612
5.0    120
0.0     37
2.0     35
Name: BsmtQual, dtype: int64

Encoded Feature: KitchenQual
2.0    734
3.0    580
4.0     99
1.0     39
Name: KitchenQual, dtype: int64

------------------------
Data Frame: df_train_outliers_1
------------------------------------------------------------------------
Encoded Feature: OverallQual
2    364
3    351
4    288
5    123
1     95
6     17
0     16
7      2
Name: OverallQual, dtype: int64

Encoded Feature: BsmtQual
3.0    579
4.0    554
5.0     65
2.0     34
0.0     24
Name: BsmtQual, dtype: int64

Encoded Feature: KitchenQual
2.0    663
3.0    517
4.0     50
1.0     26
Name: KitchenQual, dtype: int64

------------------------
Data Frame: df_train_outliers_2
------------------------------------------------------------------------
Encoded Feature: OverallQual
4    397
5    372
6    315
7    167
3    116
8     43
2     20
9     17
1      3
0      2
Name: OverallQual, dtype: int64

Encoded Feature: BsmtQual
3.0    648
4.0    612
5.0    120
0.0     37
2.0     35
Name: BsmtQual, dtype: int64

Encoded Feature: KitchenQual
2.0    734
3.0    580
4.0     99
1.0     39
Name: KitchenQual, dtype: int64

------------------------


numerical_columns
len(numerical_columns)

10


df_train_garage_impute.select_dtypes(include=['int64', 'float64', 'int32']).columns

Index(['Id', 'LotFrontage', 'LotArea', 'OverallQual', 'YearBuilt',
       'YearRemodAdd', 'MasVnrArea', 'BsmtQual', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold', 'SalePrice'],
      dtype='object')


from sklearn.preprocessing import MinMaxScaler

# Create an instance of MinMaxScaler
scaler = MinMaxScaler()

# Loop through each dataframe and apply normalization to numerical columns
for df in encoded_dataframes:
    # Apply normalization to numerical columns
    df[numerical_columns] = scaler.fit_transform(df[numerical_columns])


import matplotlib.pyplot as plt
import seaborn as sns
from ipywidgets import interact, Dropdown

# Create a scatter plot function
def scatter_plot(column1, column2):
    plt.figure(figsize=(8, 6))
    sns.scatterplot(x=column1, y=column2, data=df_train)
    plt.xlabel(column1)
    plt.ylabel(column2)
    plt.title("Scatter Plot: {} vs {}".format(column1, column2))
    plt.show()

# Get the list of column names from df_train
column_names = df_train_outliers_1.columns.tolist()

# Create dropdown widgets for column selection
dropdown_column1 = Dropdown(options=column_names, description="Column 1:")
dropdown_column2 = Dropdown(options=column_names, description="Column 2:")

# Define the interaction function
@interact(column1=dropdown_column1, column2=dropdown_column2)
def plot_scatter(column1, column2):
    scatter_plot(column1, column2)


from sklearn.preprocessing import OrdinalEncoder

# Define the bin ranges
bins = [1870, 1900, 1920, 1940, 1960, 1980, 2000, 2020]

# Create the corresponding labels for each bin
labels = ['1870s-1890s', '1900s-1910s', '1920s-1930s', '1940s-1950s', '1960s-1970s', '1980s-1990s', '2000s-2020s']

# Define the order of the categories
category_order = ['1870s-1890s', '1900s-1910s', '1920s-1930s', '1940s-1950s', '1960s-1970s', '1980s-1990s', '2000s-2020s']

# List of dataframes
dataframes = [df_train, df_train_garage_impute, df_train_garage_unknown, df_train_outliers_1, df_train_outliers_2]

# Iterate over each dataframe
for df in dataframes:
    # Apply binning to the YearBuilt column
    df['YearBuilt_Binned'] = pd.cut(df_train_garage_impute['YearBuilt'], bins=bins, labels=labels, right=False)
    
    # Set the categories for the categorical column
    df['YearBuilt_Binned'] = df['YearBuilt_Binned'].cat.set_categories(category_order)

    # Fill missing values with a specific category
    df['YearBuilt_Binned'] = df['YearBuilt_Binned'].fillna('Unknown')

    # Create an instance of OrdinalEncoder with the specified category order
    encoder = OrdinalEncoder(categories=[category_order])

    # Fit-transform the column to perform ordinal encoding
    df['YearBuilt_Binned_encoded'] = encoder.fit_transform(df['YearBuilt_Binned'].values.reshape(-1, 1))

    # Print the modified dataframe
    print(df)

        Id MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0        1         60       RL         65.0     8450   Pave   NaN      Reg   
1        2         20       RL         80.0     9600   Pave   NaN      Reg   
2        3         60       RL         68.0    11250   Pave   NaN      IR1   
3        4         70       RL         60.0     9550   Pave   NaN      IR1   
4        5         60       RL         84.0    14260   Pave   NaN      IR1   
...    ...        ...      ...          ...      ...    ...   ...      ...   
1455  1456         60       RL         62.0     7917   Pave   NaN      Reg   
1456  1457         20       RL         85.0    13175   Pave   NaN      Reg   
1457  1458         70       RL         66.0     9042   Pave   NaN      Reg   
1458  1459         20       RL         68.0     9717   Pave   NaN      Reg   
1459  1460         20       RL         75.0     9937   Pave   NaN      Reg   

     LandContour Utilities  ...  Fence MiscFeature MiscVal MoSold YrSold  \
0            Lvl    AllPub  ...    NaN         NaN       0      2   2008   
1            Lvl    AllPub  ...    NaN         NaN       0      5   2007   
2            Lvl    AllPub  ...    NaN         NaN       0      9   2008   
3            Lvl    AllPub  ...    NaN         NaN       0      2   2006   
4            Lvl    AllPub  ...    NaN         NaN       0     12   2008   
...          ...       ...  ...    ...         ...     ...    ...    ...   
1455         Lvl    AllPub  ...    NaN         NaN       0      8   2007   
1456         Lvl    AllPub  ...  MnPrv         NaN       0      2   2010   
1457         Lvl    AllPub  ...  GdPrv        Shed    2500      5   2010   
1458         Lvl    AllPub  ...    NaN         NaN       0      4   2010   
1459         Lvl    AllPub  ...    NaN         NaN       0      6   2008   

     SaleType SaleCondition  SalePrice YearBuilt_Binned  \
0          WD        Normal   0.241078      2000s-2020s   
1          WD        Normal   0.203583      1960s-1970s   
2          WD        Normal   0.261908      2000s-2020s   
3          WD       Abnorml   0.145952      1900s-1910s   
4          WD        Normal   0.298709      2000s-2020s   
...       ...           ...        ...              ...   
1455       WD        Normal   0.194556      1980s-1990s   
1456       WD        Normal   0.243161      1960s-1970s   
1457       WD        Normal   0.321622      1940s-1950s   
1458       WD        Normal   0.148903      1940s-1950s   
1459       WD        Normal   0.156367      1960s-1970s   

      YearBuilt_Binned_encoded  
0                          6.0  
1                          4.0  
2                          6.0  
3                          1.0  
4                          6.0  
...                        ...  
1455                       5.0  
1456                       4.0  
1457                       3.0  
1458                       3.0  
1459                       4.0  

[1452 rows x 83 columns]
        Id MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0        1         60       RL         65.0     8450   Pave   NaN      Reg   
1        2         20       RL         80.0     9600   Pave   NaN      Reg   
2        3         60       RL         68.0    11250   Pave   NaN      IR1   
3        4         70       RL         60.0     9550   Pave   NaN      IR1   
4        5         60       RL         84.0    14260   Pave   NaN      IR1   
...    ...        ...      ...          ...      ...    ...   ...      ...   
1455  1456         60       RL         62.0     7917   Pave   NaN      Reg   
1456  1457         20       RL         85.0    13175   Pave   NaN      Reg   
1457  1458         70       RL         66.0     9042   Pave   NaN      Reg   
1458  1459         20       RL         68.0     9717   Pave   NaN      Reg   
1459  1460         20       RL         75.0     9937   Pave   NaN      Reg   

     LandContour Utilities  ...  Fence MiscFeature MiscVal MoSold YrSold  \
0            Lvl    AllPub  ...    NaN         NaN       0      2   2008   
1            Lvl    AllPub  ...    NaN         NaN       0      5   2007   
2            Lvl    AllPub  ...    NaN         NaN       0      9   2008   
3            Lvl    AllPub  ...    NaN         NaN       0      2   2006   
4            Lvl    AllPub  ...    NaN         NaN       0     12   2008   
...          ...       ...  ...    ...         ...     ...    ...    ...   
1455         Lvl    AllPub  ...    NaN         NaN       0      8   2007   
1456         Lvl    AllPub  ...  MnPrv         NaN       0      2   2010   
1457         Lvl    AllPub  ...  GdPrv        Shed    2500      5   2010   
1458         Lvl    AllPub  ...    NaN         NaN       0      4   2010   
1459         Lvl    AllPub  ...    NaN         NaN       0      6   2008   

     SaleType SaleCondition  SalePrice YearBuilt_Binned  \
0          WD        Normal   0.241078      2000s-2020s   
1          WD        Normal   0.203583      1960s-1970s   
2          WD        Normal   0.261908      2000s-2020s   
3          WD       Abnorml   0.145952      1900s-1910s   
4          WD        Normal   0.298709      2000s-2020s   
...       ...           ...        ...              ...   
1455       WD        Normal   0.194556      1980s-1990s   
1456       WD        Normal   0.243161      1960s-1970s   
1457       WD        Normal   0.321622      1940s-1950s   
1458       WD        Normal   0.148903      1940s-1950s   
1459       WD        Normal   0.156367      1960s-1970s   

      YearBuilt_Binned_encoded  
0                          6.0  
1                          4.0  
2                          6.0  
3                          1.0  
4                          6.0  
...                        ...  
1455                       5.0  
1456                       4.0  
1457                       3.0  
1458                       3.0  
1459                       4.0  

[1452 rows x 83 columns]
        Id MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0        1         60       RL         65.0     8450   Pave   NaN      Reg   
1        2         20       RL         80.0     9600   Pave   NaN      Reg   
2        3         60       RL         68.0    11250   Pave   NaN      IR1   
3        4         70       RL         60.0     9550   Pave   NaN      IR1   
4        5         60       RL         84.0    14260   Pave   NaN      IR1   
...    ...        ...      ...          ...      ...    ...   ...      ...   
1455  1456         60       RL         62.0     7917   Pave   NaN      Reg   
1456  1457         20       RL         85.0    13175   Pave   NaN      Reg   
1457  1458         70       RL         66.0     9042   Pave   NaN      Reg   
1458  1459         20       RL         68.0     9717   Pave   NaN      Reg   
1459  1460         20       RL         75.0     9937   Pave   NaN      Reg   

     LandContour Utilities  ...  Fence MiscFeature MiscVal MoSold YrSold  \
0            Lvl    AllPub  ...    NaN         NaN       0      2   2008   
1            Lvl    AllPub  ...    NaN         NaN       0      5   2007   
2            Lvl    AllPub  ...    NaN         NaN       0      9   2008   
3            Lvl    AllPub  ...    NaN         NaN       0      2   2006   
4            Lvl    AllPub  ...    NaN         NaN       0     12   2008   
...          ...       ...  ...    ...         ...     ...    ...    ...   
1455         Lvl    AllPub  ...    NaN         NaN       0      8   2007   
1456         Lvl    AllPub  ...  MnPrv         NaN       0      2   2010   
1457         Lvl    AllPub  ...  GdPrv        Shed    2500      5   2010   
1458         Lvl    AllPub  ...    NaN         NaN       0      4   2010   
1459         Lvl    AllPub  ...    NaN         NaN       0      6   2008   

     SaleType SaleCondition  SalePrice YearBuilt_Binned  \
0          WD        Normal   0.241078      2000s-2020s   
1          WD        Normal   0.203583      1960s-1970s   
2          WD        Normal   0.261908      2000s-2020s   
3          WD       Abnorml   0.145952      1900s-1910s   
4          WD        Normal   0.298709      2000s-2020s   
...       ...           ...        ...              ...   
1455       WD        Normal   0.194556      1980s-1990s   
1456       WD        Normal   0.243161      1960s-1970s   
1457       WD        Normal   0.321622      1940s-1950s   
1458       WD        Normal   0.148903      1940s-1950s   
1459       WD        Normal   0.156367      1960s-1970s   

      YearBuilt_Binned_encoded  
0                          6.0  
1                          4.0  
2                          6.0  
3                          1.0  
4                          6.0  
...                        ...  
1455                       5.0  
1456                       4.0  
1457                       3.0  
1458                       3.0  
1459                       4.0  

[1452 rows x 83 columns]
        Id MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0        1         60       RL         65.0     8450   Pave   NaN      Reg   
1        2         20       RL         80.0     9600   Pave   NaN      Reg   
2        3         60       RL         68.0    11250   Pave   NaN      IR1   
3        4         70       RL         60.0     9550   Pave   NaN      IR1   
4        5         60       RL         84.0    14260   Pave   NaN      IR1   
...    ...        ...      ...          ...      ...    ...   ...      ...   
1454  1455         20       FV         62.0     7500   Pave  Pave      Reg   
1455  1456         60       RL         62.0     7917   Pave   NaN      Reg   
1456  1457         20       RL         85.0    13175   Pave   NaN      Reg   
1458  1459         20       RL         68.0     9717   Pave   NaN      Reg   
1459  1460         20       RL         75.0     9937   Pave   NaN      Reg   

     LandContour Utilities  ...  Fence MiscFeature MiscVal MoSold YrSold  \
0            Lvl    AllPub  ...    NaN         NaN       0      2   2008   
1            Lvl    AllPub  ...    NaN         NaN       0      5   2007   
2            Lvl    AllPub  ...    NaN         NaN       0      9   2008   
3            Lvl    AllPub  ...    NaN         NaN       0      2   2006   
4            Lvl    AllPub  ...    NaN         NaN       0     12   2008   
...          ...       ...  ...    ...         ...     ...    ...    ...   
1454         Lvl    AllPub  ...    NaN         NaN       0     10   2009   
1455         Lvl    AllPub  ...    NaN         NaN       0      8   2007   
1456         Lvl    AllPub  ...  MnPrv         NaN       0      2   2010   
1458         Lvl    AllPub  ...    NaN         NaN       0      4   2010   
1459         Lvl    AllPub  ...    NaN         NaN       0      6   2008   

     SaleType SaleCondition  SalePrice YearBuilt_Binned  \
0          WD        Normal   0.478014      2000s-2020s   
1          WD        Normal   0.401418      1960s-1970s   
2          WD        Normal   0.520567      2000s-2020s   
3          WD       Abnorml   0.283688      1900s-1910s   
4          WD        Normal   0.595745      2000s-2020s   
...       ...           ...        ...              ...   
1454       WD        Normal   0.411348      2000s-2020s   
1455       WD        Normal   0.382979      1980s-1990s   
1456       WD        Normal   0.482270      1960s-1970s   
1458       WD        Normal   0.289716      1940s-1950s   
1459       WD        Normal   0.304965      1960s-1970s   

      YearBuilt_Binned_encoded  
0                          6.0  
1                          4.0  
2                          6.0  
3                          1.0  
4                          6.0  
...                        ...  
1454                       6.0  
1455                       5.0  
1456                       4.0  
1458                       3.0  
1459                       4.0  

[1256 rows x 83 columns]
        Id MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0        1         60       RL         65.0     8450   Pave   NaN      Reg   
1        2         20       RL         80.0     9600   Pave   NaN      Reg   
2        3         60       RL         68.0    11250   Pave   NaN      IR1   
3        4         70       RL         60.0     9550   Pave   NaN      IR1   
4        5         60       RL         84.0    14260   Pave   NaN      IR1   
...    ...        ...      ...          ...      ...    ...   ...      ...   
1455  1456         60       RL         62.0     7917   Pave   NaN      Reg   
1456  1457         20       RL         85.0    13175   Pave   NaN      Reg   
1457  1458         70       RL         66.0     9042   Pave   NaN      Reg   
1458  1459         20       RL         68.0     9717   Pave   NaN      Reg   
1459  1460         20       RL         75.0     9937   Pave   NaN      Reg   

     LandContour Utilities  ...  Fence MiscFeature MiscVal MoSold YrSold  \
0            Lvl    AllPub  ...    NaN         NaN       0      2   2008   
1            Lvl    AllPub  ...    NaN         NaN       0      5   2007   
2            Lvl    AllPub  ...    NaN         NaN       0      9   2008   
3            Lvl    AllPub  ...    NaN         NaN       0      2   2006   
4            Lvl    AllPub  ...    NaN         NaN       0     12   2008   
...          ...       ...  ...    ...         ...     ...    ...    ...   
1455         Lvl    AllPub  ...    NaN         NaN       0      8   2007   
1456         Lvl    AllPub  ...  MnPrv         NaN       0      2   2010   
1457         Lvl    AllPub  ...  GdPrv        Shed    2500      5   2010   
1458         Lvl    AllPub  ...    NaN         NaN       0      4   2010   
1459         Lvl    AllPub  ...    NaN         NaN       0      6   2008   

     SaleType SaleCondition  SalePrice YearBuilt_Binned  \
0          WD        Normal   0.506303      2000s-2020s   
1          WD        Normal   0.392857      1960s-1970s   
2          WD        Normal   0.569328      2000s-2020s   
3          WD       Abnorml   0.218487      1900s-1910s   
4          WD        Normal   0.680672      2000s-2020s   
...       ...           ...        ...              ...   
1455       WD        Normal   0.365546      1980s-1990s   
1456       WD        Normal   0.512605      1960s-1970s   
1457       WD        Normal   0.750000      1940s-1950s   
1458       WD        Normal   0.227416      1940s-1950s   
1459       WD        Normal   0.250000      1960s-1970s   

      YearBuilt_Binned_encoded  
0                          6.0  
1                          4.0  
2                          6.0  
3                          1.0  
4                          6.0  
...                        ...  
1455                       5.0  
1456                       4.0  
1457                       3.0  
1458                       3.0  
1459                       4.0  

[1452 rows x 83 columns]


i = 0
for df in dataframes:
    print(f'DataFrame Name: {dataframe_names[i]}')
    print(df['YearBuilt_Binned_encoded'].value_counts())
    print("_"*50)
    i +=1

DataFrame Name: df_train
6.0    381
4.0    363
3.0    245
5.0    224
2.0    151
1.0     73
0.0     15
Name: YearBuilt_Binned_encoded, dtype: int64
__________________________________________________
DataFrame Name: df_train_garage_impute
6.0    381
4.0    363
3.0    245
5.0    224
2.0    151
1.0     73
0.0     15
Name: YearBuilt_Binned_encoded, dtype: int64
__________________________________________________
DataFrame Name: df_train_garage_unknown
6.0    381
4.0    363
3.0    245
5.0    224
2.0    151
1.0     73
0.0     15
Name: YearBuilt_Binned_encoded, dtype: int64
__________________________________________________
DataFrame Name: df_train_outliers_1
6.0    320
4.0    320
3.0    222
5.0    195
2.0    130
1.0     60
0.0      9
Name: YearBuilt_Binned_encoded, dtype: int64
__________________________________________________
DataFrame Name: df_train_outliers_2
6.0    381
4.0    363
3.0    245
5.0    224
2.0    151
1.0     73
0.0     15
Name: YearBuilt_Binned_encoded, dtype: int64
__________________________________________________


from sklearn.model_selection import train_test_split

# Define the list of data frames
dataframes = [df_train, df_train_garage_impute, df_train_garage_unknown, df_train_outliers_1, df_train_outliers_2]

# Define the names of the data frames
dataframes_names = ['df_train', 'df_train_garage_impute', 'df_train_garage_unknown', 'df_train_outliers_1', 'df_train_outliers_2']

# Perform data splitting on each data frame
for df, df_name in zip(dataframes, dataframes_names):
    # Separate the features (X) and the target variable (y)
    X = df.drop(['SalePrice', 'YearBuilt_Binned'], axis=1)
    y = df['SalePrice']

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Assign custom variable names based on the data frame name
    train_name = f"X_train_{df_name}"
    test_name = f"X_test_{df_name}"
    target_train_name = f"y_train_{df_name}"
    target_test_name = f"y_test_{df_name}"
    
    # Assign the data splits to the corresponding variables
    globals()[train_name] = X_train
    globals()[test_name] = X_test
    globals()[target_train_name] = y_train
    globals()[target_test_name] = y_test

    # Print the shapes of the resulting datasets
    print(f"Training set shape ({train_name}):", X_train.shape, y_train.shape)
    print(f"Testing set shape ({test_name}):", X_test.shape, y_test.shape)
    print("-----------------------------------")

Training set shape (X_train_df_train): (1161, 81) (1161,)
Testing set shape (X_test_df_train): (291, 81) (291,)
-----------------------------------
Training set shape (X_train_df_train_garage_impute): (1161, 81) (1161,)
Testing set shape (X_test_df_train_garage_impute): (291, 81) (291,)
-----------------------------------
Training set shape (X_train_df_train_garage_unknown): (1161, 81) (1161,)
Testing set shape (X_test_df_train_garage_unknown): (291, 81) (291,)
-----------------------------------
Training set shape (X_train_df_train_outliers_1): (1004, 81) (1004,)
Testing set shape (X_test_df_train_outliers_1): (252, 81) (252,)
-----------------------------------
Training set shape (X_train_df_train_outliers_2): (1161, 81) (1161,)
Testing set shape (X_test_df_train_outliers_2): (291, 81) (291,)
-----------------------------------


X_test_df_train

	count	mean	std	min	25%	50%	75%	max
Id	1460.0	730.500000	421.610009	1.0	365.75	730.5	1095.25	1460.0
MSSubClass	1460.0	56.897260	42.300571	20.0	20.00	50.0	70.00	190.0
LotFrontage	1201.0	70.049958	24.284752	21.0	59.00	69.0	80.00	313.0
LotArea	1460.0	10516.828082	9981.264932	1300.0	7553.50	9478.5	11601.50	215245.0
OverallQual	1460.0	6.099315	1.382997	1.0	5.00	6.0	7.00	10.0
OverallCond	1460.0	5.575342	1.112799	1.0	5.00	5.0	6.00	9.0
YearBuilt	1460.0	1971.267808	30.202904	1872.0	1954.00	1973.0	2000.00	2010.0
YearRemodAdd	1460.0	1984.865753	20.645407	1950.0	1967.00	1994.0	2004.00	2010.0
MasVnrArea	1452.0	103.685262	181.066207	0.0	0.00	0.0	166.00	1600.0
BsmtFinSF1	1460.0	443.639726	456.098091	0.0	0.00	383.5	712.25	5644.0
BsmtFinSF2	1460.0	46.549315	161.319273	0.0	0.00	0.0	0.00	1474.0
BsmtUnfSF	1460.0	567.240411	441.866955	0.0	223.00	477.5	808.00	2336.0
TotalBsmtSF	1460.0	1057.429452	438.705324	0.0	795.75	991.5	1298.25	6110.0
1stFlrSF	1460.0	1162.626712	386.587738	334.0	882.00	1087.0	1391.25	4692.0
2ndFlrSF	1460.0	346.992466	436.528436	0.0	0.00	0.0	728.00	2065.0
LowQualFinSF	1460.0	5.844521	48.623081	0.0	0.00	0.0	0.00	572.0
GrLivArea	1460.0	1515.463699	525.480383	334.0	1129.50	1464.0	1776.75	5642.0
BsmtFullBath	1460.0	0.425342	0.518911	0.0	0.00	0.0	1.00	3.0
BsmtHalfBath	1460.0	0.057534	0.238753	0.0	0.00	0.0	0.00	2.0
FullBath	1460.0	1.565068	0.550916	0.0	1.00	2.0	2.00	3.0
HalfBath	1460.0	0.382877	0.502885	0.0	0.00	0.0	1.00	2.0
BedroomAbvGr	1460.0	2.866438	0.815778	0.0	2.00	3.0	3.00	8.0
KitchenAbvGr	1460.0	1.046575	0.220338	0.0	1.00	1.0	1.00	3.0
TotRmsAbvGrd	1460.0	6.517808	1.625393	2.0	5.00	6.0	7.00	14.0
Fireplaces	1460.0	0.613014	0.644666	0.0	0.00	1.0	1.00	3.0
GarageYrBlt	1379.0	1978.506164	24.689725	1900.0	1961.00	1980.0	2002.00	2010.0
GarageCars	1460.0	1.767123	0.747315	0.0	1.00	2.0	2.00	4.0
GarageArea	1460.0	472.980137	213.804841	0.0	334.50	480.0	576.00	1418.0
WoodDeckSF	1460.0	94.244521	125.338794	0.0	0.00	0.0	168.00	857.0
OpenPorchSF	1460.0	46.660274	66.256028	0.0	0.00	25.0	68.00	547.0
EnclosedPorch	1460.0	21.954110	61.119149	0.0	0.00	0.0	0.00	552.0
3SsnPorch	1460.0	3.409589	29.317331	0.0	0.00	0.0	0.00	508.0
ScreenPorch	1460.0	15.060959	55.757415	0.0	0.00	0.0	0.00	480.0
PoolArea	1460.0	2.758904	40.177307	0.0	0.00	0.0	0.00	738.0
MiscVal	1460.0	43.489041	496.123024	0.0	0.00	0.0	0.00	15500.0
MoSold	1460.0	6.321918	2.703626	1.0	5.00	6.0	8.00	12.0
YrSold	1460.0	2007.815753	1.328095	2006.0	2007.00	2008.0	2009.00	2010.0
SalePrice	1460.0	180921.195890	79442.502883	34900.0	129975.00	163000.0	214000.00	755000.0

	count	mean	std	min	25%	50%	75%	max
Id	1459.0	2190.000000	421.321334	1461.0	1825.50	2190.0	2554.50	2919.0
MSSubClass	1459.0	57.378341	42.746880	20.0	20.00	50.0	70.00	190.0
LotFrontage	1232.0	68.580357	22.376841	21.0	58.00	67.0	80.00	200.0
LotArea	1459.0	9819.161069	4955.517327	1470.0	7391.00	9399.0	11517.50	56600.0
OverallQual	1459.0	6.078821	1.436812	1.0	5.00	6.0	7.00	10.0
OverallCond	1459.0	5.553804	1.113740	1.0	5.00	5.0	6.00	9.0
YearBuilt	1459.0	1971.357779	30.390071	1879.0	1953.00	1973.0	2001.00	2010.0
YearRemodAdd	1459.0	1983.662783	21.130467	1950.0	1963.00	1992.0	2004.00	2010.0
MasVnrArea	1444.0	100.709141	177.625900	0.0	0.00	0.0	164.00	1290.0
BsmtFinSF1	1458.0	439.203704	455.268042	0.0	0.00	350.5	753.50	4010.0
BsmtFinSF2	1458.0	52.619342	176.753926	0.0	0.00	0.0	0.00	1526.0
BsmtUnfSF	1458.0	554.294925	437.260486	0.0	219.25	460.0	797.75	2140.0
TotalBsmtSF	1458.0	1046.117970	442.898624	0.0	784.00	988.0	1305.00	5095.0
1stFlrSF	1459.0	1156.534613	398.165820	407.0	873.50	1079.0	1382.50	5095.0
2ndFlrSF	1459.0	325.967786	420.610226	0.0	0.00	0.0	676.00	1862.0
LowQualFinSF	1459.0	3.543523	44.043251	0.0	0.00	0.0	0.00	1064.0
GrLivArea	1459.0	1486.045922	485.566099	407.0	1117.50	1432.0	1721.00	5095.0
BsmtFullBath	1457.0	0.434454	0.530648	0.0	0.00	0.0	1.00	3.0
BsmtHalfBath	1457.0	0.065202	0.252468	0.0	0.00	0.0	0.00	2.0
FullBath	1459.0	1.570939	0.555190	0.0	1.00	2.0	2.00	4.0
HalfBath	1459.0	0.377656	0.503017	0.0	0.00	0.0	1.00	2.0
BedroomAbvGr	1459.0	2.854010	0.829788	0.0	2.00	3.0	3.00	6.0
KitchenAbvGr	1459.0	1.042495	0.208472	0.0	1.00	1.0	1.00	2.0
TotRmsAbvGrd	1459.0	6.385195	1.508895	3.0	5.00	6.0	7.00	15.0
Fireplaces	1459.0	0.581220	0.647420	0.0	0.00	0.0	1.00	4.0
GarageYrBlt	1381.0	1977.721217	26.431175	1895.0	1959.00	1979.0	2002.00	2207.0
GarageCars	1458.0	1.766118	0.775945	0.0	1.00	2.0	2.00	5.0
GarageArea	1458.0	472.768861	217.048611	0.0	318.00	480.0	576.00	1488.0
WoodDeckSF	1459.0	93.174777	127.744882	0.0	0.00	0.0	168.00	1424.0
OpenPorchSF	1459.0	48.313914	68.883364	0.0	0.00	28.0	72.00	742.0
EnclosedPorch	1459.0	24.243317	67.227765	0.0	0.00	0.0	0.00	1012.0
3SsnPorch	1459.0	1.794380	20.207842	0.0	0.00	0.0	0.00	360.0
ScreenPorch	1459.0	17.064428	56.609763	0.0	0.00	0.0	0.00	576.0
PoolArea	1459.0	1.744345	30.491646	0.0	0.00	0.0	0.00	800.0
MiscVal	1459.0	58.167923	630.806978	0.0	0.00	0.0	0.00	17000.0
MoSold	1459.0	6.104181	2.722432	1.0	4.00	6.0	8.00	12.0
YrSold	1459.0	2007.769705	1.301740	2006.0	2007.00	2008.0	2009.00	2010.0

	Id	MSSubClass	MSZoning	LotFrontage	LotArea	Street	Alley	LotShape	LandContour	Utilities	...	PoolQC	Fence	MiscFeature	MiscVal	MoSold	YrSold	SaleType	SaleCondition	SalePrice
0	1	60	RL	65.0	8450	Pave	NaN	Reg	Lvl	AllPub	...	NaN	NaN	NaN	0	2	2008	WD	Normal	208500
1	2	20	RL	80.0	9600	Pave	NaN	Reg	Lvl	AllPub	...	NaN	NaN	NaN	0	5	2007	WD	Normal	181500
2	3	60	RL	68.0	11250	Pave	NaN	IR1	Lvl	AllPub	...	NaN	NaN	NaN	0	9	2008	WD	Normal	223500
3	4	70	RL	60.0	9550	Pave	NaN	IR1	Lvl	AllPub	...	NaN	NaN	NaN	0	2	2006	WD	Abnorml	140000
4	5	60	RL	84.0	14260	Pave	NaN	IR1	Lvl	AllPub	...	NaN	NaN	NaN	0	12	2008	WD	Normal	250000
5	6	50	RL	85.0	14115	Pave	NaN	IR1	Lvl	AllPub	...	NaN	MnPrv	Shed	700	10	2009	WD	Normal	143000
6	7	20	RL	75.0	10084	Pave	NaN	Reg	Lvl	AllPub	...	NaN	NaN	NaN	0	8	2007	WD	Normal	307000
7	8	60	RL	NaN	10382	Pave	NaN	IR1	Lvl	AllPub	...	NaN	NaN	Shed	350	11	2009	WD	Normal	200000
8	9	50	RM	51.0	6120	Pave	NaN	Reg	Lvl	AllPub	...	NaN	NaN	NaN	0	4	2008	WD	Abnorml	129900
9	10	190	RL	50.0	7420	Pave	NaN	Reg	Lvl	AllPub	...	NaN	NaN	NaN	0	1	2008	WD	Normal	118000

	OverallQual	YearBuilt	YearRemodAdd	MasVnrArea	BsmtQual	TotalBsmtSF	1stFlrSF	GrLivArea	FullBath	KitchenQual	TotRmsAbvGrd	Fireplaces	GarageYrBlt	GarageCars	GarageArea	SalePrice
0	7	2003	2003	196.0	Gd	856	856	1710	2	Gd	8	0	2003.0	2	548	208500
1	6	1976	1976	0.0	Gd	1262	1262	1262	2	TA	6	1	1976.0	2	460	181500
2	7	2001	2002	162.0	Gd	920	920	1786	2	Gd	6	1	2001.0	2	608	223500
3	7	1915	1970	0.0	TA	756	961	1717	1	Gd	7	1	1998.0	3	642	140000
4	8	2000	2000	350.0	Gd	1145	1145	2198	2	Gd	9	1	2000.0	3	836	250000
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1455	6	1999	2000	0.0	Gd	953	953	1647	2	TA	7	1	1999.0	2	460	175000
1456	6	1978	1988	119.0	Gd	1542	2073	2073	2	TA	7	2	1978.0	2	500	210000
1457	7	1941	2006	0.0	TA	1152	1188	2340	2	Gd	9	2	1941.0	1	252	266500
1458	5	1950	1996	0.0	TA	1078	1078	1078	1	Gd	5	0	1950.0	1	240	142125
1459	5	1965	1965	0.0	TA	1256	1256	1256	1	TA	6	0	1965.0	1	276	147500

	GarageYrBlt	GarageQual
39	NaN	NaN
48	NaN	NaN
78	NaN	NaN
88	NaN	NaN
89	NaN	NaN
...	...	...
1349	NaN	NaN
1407	NaN	NaN
1449	NaN	NaN
1450	NaN	NaN
1453	NaN	NaN

	Id	MSSubClass	MSZoning	LotFrontage	LotArea	Street	Alley	LotShape	LandContour	Utilities	...	PoolArea	PoolQC	Fence	MiscFeature	MiscVal	MoSold	YrSold	SaleType	SaleCondition	YearBuilt_Binned_encoded
1042	1043	120	RL	34.0	5381	Pave	NaN	IR1	Lvl	AllPub	...	0	NaN	NaN	NaN	0	8	2009	WD	Normal	6.0
1130	1131	50	RL	65.0	7804	Pave	NaN	Reg	Lvl	AllPub	...	0	NaN	MnPrv	NaN	0	12	2009	WD	Normal	2.0
1003	1004	90	RL	NaN	11500	Pave	NaN	IR1	Lvl	AllPub	...	0	NaN	NaN	NaN	0	6	2007	WD	Normal	4.0
1324	1325	20	RL	75.0	9986	Pave	NaN	Reg	Lvl	AllPub	...	0	NaN	NaN	NaN	0	2	2007	New	Partial	6.0
531	532	70	RM	60.0	6155	Pave	NaN	IR1	Lvl	AllPub	...	0	NaN	NaN	NaN	0	6	2008	WD	Normal	2.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
897	898	90	RL	64.0	7018	Pave	NaN	Reg	Lvl	AllPub	...	0	NaN	NaN	NaN	0	6	2009	WD	Alloca	4.0
1334	1335	160	RM	24.0	2368	Pave	NaN	Reg	Lvl	AllPub	...	0	NaN	NaN	NaN	0	5	2009	WD	Normal	4.0
1416	1417	190	RM	60.0	11340	Pave	NaN	Reg	Lvl	AllPub	...	0	NaN	NaN	NaN	0	4	2010	WD	Normal	0.0
718	719	60	RL	96.0	10542	Pave	NaN	Reg	Lvl	AllPub	...	0	NaN	NaN	NaN	0	8	2008	WD	Normal	5.0
255	256	60	RL	66.0	8738	Pave	NaN	IR1	Lvl	AllPub	...	0	NaN	NaN	NaN	0	2	2006	WD	Normal	5.0

House price prediction

About The Project

Objective¶

Steps Involved¶

Tools and Libraries¶

Dataset¶

Conclusion¶

About The Dataset

Column Descriptions¶

Here's the updated column descriptions with additional explanations:¶

1. Data Collection:

To get data sets from Kaggle using the Kaggle API, you need to follow these steps:¶

2. Data Loading, Cleaning and Exploration:

1. Data Loading and Cleaning

Import the Essential Libraries¶

Load the dataset using pandas¶

Check the shape of the dataset to get the number of rows and columns.¶

Display the first few rows of the dataset¶

check the data types of each column and save them¶

Identify any missing values in the dataset¶

2. Data Exploration:

knowing the dtypes of columns and how many non-null values are there:¶

Calculate basic statistics of all the columns and especially the target variable 'SalePrice'¶

Visualize the distribution of the target variable using a histogram or boxplot.¶

Interactive Histogram to Understand Train Data, Test Data, and Differences using Plotly and ipywidgets¶

Explore the relationship between numerical features and the target variable¶

Identify the categorical features¶

Visualize the distribution of categorical features¶

Visualize the distribution of categorical features¶

Explore the correlation between features¶

nvestigate any outliers or anomalies in the data¶

Analyze the distribution of missing values across the features¶

Explore the distribution of features by grouping them based on certain criteria¶

#¶

3. Data Preprocessing, Feature Selection and Engineering:

1. Handle Missing Values:

1. Delete columns with many missing values and no significant impact on SalePrice¶

2. convert the columns to the correct data type

3. Data Cleaning:

1. Define categorical, numerical, and date columns¶

1.check for missing values¶

1. Check for inconsistencies or errors in categorical columns¶

1. Identify potential outliers in numerical columns¶

1. Check for inconsistencies or errors in date columns¶

1. Check for missing values in each date column¶

1. Check for valid dates in each date column¶

3.5 Summary of Different Data Frames for Machine Learning Model Building

4. Feature Encoding:

1. Identify the categorical variables: Determine which columns in your dataset contain categorical data that needs to be encoded.¶

2. Choose an encoding technique & Apply the chosen encoding technique:¶

1. For the first column, OverallQual, the most suitable encoding technique is label encoding.¶

2. For the column BsmtQual, the best encoding technique to use is Ordinal Encoding.¶

3. For the "KitchenQual" column, the best encoding technique would be ordinal encoding.¶

3. Evaluate the encoded features in the data frames:¶

5. Feature Scaling:

1. Identify the numerical columns in your dataset that need to be scaled.¶

2. Choose the scaling technique that is most suitable for your dataset and problem.¶

6. Feature Engineering:

1. Interaction Features:¶

4. Binning and Bucketing:¶

7. Data Transformation:

8. Data Splitting:

4. Model Training and Evaluation:

5. Model Optimization:

6. Final Prediction and Interpretation:

Upcoming Developments: Exploring Model Training, Optimization, and Final Predictions

📞 Contact Me¶