In [1]:
#importing the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import random
In [2]:
#importing the dataset
data = pd.read_csv(r"C:\Users\T\Desktop\UCT Integrated Data Energy Analytics\Energy Data Sets\Delhi 5 Minute Electricity Demand for Forecasting Dataset\Delhi_electrictity_demand_data.csv")
In [3]:
data
Out[3]:
| Unnamed: 0 | datetime | Power demand | temp | dwpt | rhum | wdir | wspd | pres | year | month | day | hour | minute | moving_avg_3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 2021-01-01 00:30:00 | 2014.00 | 8.0 | 6.9 | 93.0 | 0.0 | 0.0 | 1017.0 | 2021 | 1 | 1 | 0 | 30 | NaN |
| 1 | 1 | 2021-01-01 00:35:00 | 2005.63 | 8.0 | 6.9 | 93.0 | 0.0 | 0.0 | 1017.0 | 2021 | 1 | 1 | 0 | 35 | NaN |
| 2 | 2 | 2021-01-01 00:40:00 | 1977.60 | 8.0 | 6.9 | 93.0 | 0.0 | 0.0 | 1017.0 | 2021 | 1 | 1 | 0 | 40 | 1999.076667 |
| 3 | 3 | 2021-01-01 00:45:00 | 1976.44 | 8.0 | 6.9 | 93.0 | 0.0 | 0.0 | 1017.0 | 2021 | 1 | 1 | 0 | 45 | 1986.556667 |
| 4 | 4 | 2021-01-01 00:50:00 | 1954.37 | 8.0 | 6.9 | 93.0 | 0.0 | 0.0 | 1017.0 | 2021 | 1 | 1 | 0 | 50 | 1969.470000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 393435 | 393435 | 2024-12-12 00:10:00 | 2146.84 | 12.3 | 6.8 | 69.0 | 269.0 | 1.8 | 1019.4 | 2024 | 12 | 12 | 0 | 10 | 2174.893333 |
| 393436 | 393436 | 2024-12-12 00:15:00 | 2116.66 | 12.3 | 6.8 | 69.0 | 269.0 | 1.8 | 1019.4 | 2024 | 12 | 12 | 0 | 15 | 2139.416667 |
| 393437 | 393437 | 2024-12-12 00:20:00 | 2082.77 | 12.3 | 6.8 | 69.0 | 269.0 | 1.8 | 1019.4 | 2024 | 12 | 12 | 0 | 20 | 2115.423333 |
| 393438 | 393438 | 2024-12-12 00:25:00 | 2059.17 | 12.3 | 6.8 | 69.0 | 269.0 | 1.8 | 1019.4 | 2024 | 12 | 12 | 0 | 25 | 2086.200000 |
| 393439 | 393439 | 2024-12-12 00:30:00 | 2049.66 | 12.3 | 6.8 | 69.0 | 269.0 | 1.8 | 1019.4 | 2024 | 12 | 12 | 0 | 30 | 2063.866667 |
393440 rows × 15 columns
In [4]:
#checking the basic informatrion about the data
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 393440 entries, 0 to 393439 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 393440 non-null int64 1 datetime 393440 non-null object 2 Power demand 393440 non-null float64 3 temp 393440 non-null float64 4 dwpt 393440 non-null float64 5 rhum 393440 non-null float64 6 wdir 392900 non-null float64 7 wspd 393440 non-null float64 8 pres 393440 non-null float64 9 year 393440 non-null int64 10 month 393440 non-null int64 11 day 393440 non-null int64 12 hour 393440 non-null int64 13 minute 393440 non-null int64 14 moving_avg_3 393438 non-null float64 dtypes: float64(8), int64(6), object(1) memory usage: 45.0+ MB
In [5]:
#changing the timestamp to a date time format
data['datetime'] = pd.to_datetime(data['datetime'])
In [6]:
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 393440 entries, 0 to 393439 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 393440 non-null int64 1 datetime 393440 non-null datetime64[ns] 2 Power demand 393440 non-null float64 3 temp 393440 non-null float64 4 dwpt 393440 non-null float64 5 rhum 393440 non-null float64 6 wdir 392900 non-null float64 7 wspd 393440 non-null float64 8 pres 393440 non-null float64 9 year 393440 non-null int64 10 month 393440 non-null int64 11 day 393440 non-null int64 12 hour 393440 non-null int64 13 minute 393440 non-null int64 14 moving_avg_3 393438 non-null float64 dtypes: datetime64[ns](1), float64(8), int64(6) memory usage: 45.0 MB
In [7]:
#dropping the first column which contains only numbers and not useful data
data = data.drop(columns=['Unnamed: 0'])
In [8]:
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 393440 entries, 0 to 393439 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 datetime 393440 non-null datetime64[ns] 1 Power demand 393440 non-null float64 2 temp 393440 non-null float64 3 dwpt 393440 non-null float64 4 rhum 393440 non-null float64 5 wdir 392900 non-null float64 6 wspd 393440 non-null float64 7 pres 393440 non-null float64 8 year 393440 non-null int64 9 month 393440 non-null int64 10 day 393440 non-null int64 11 hour 393440 non-null int64 12 minute 393440 non-null int64 13 moving_avg_3 393438 non-null float64 dtypes: datetime64[ns](1), float64(8), int64(5) memory usage: 42.0 MB
In [9]:
#setting the datetime to the index column
data = data.set_index('datetime')
In [10]:
data.info()
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 393440 entries, 2021-01-01 00:30:00 to 2024-12-12 00:30:00 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Power demand 393440 non-null float64 1 temp 393440 non-null float64 2 dwpt 393440 non-null float64 3 rhum 393440 non-null float64 4 wdir 392900 non-null float64 5 wspd 393440 non-null float64 6 pres 393440 non-null float64 7 year 393440 non-null int64 8 month 393440 non-null int64 9 day 393440 non-null int64 10 hour 393440 non-null int64 11 minute 393440 non-null int64 12 moving_avg_3 393438 non-null float64 dtypes: float64(8), int64(5) memory usage: 42.0 MB
In [11]:
data
Out[11]:
| Power demand | temp | dwpt | rhum | wdir | wspd | pres | year | month | day | hour | minute | moving_avg_3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| datetime | |||||||||||||
| 2021-01-01 00:30:00 | 2014.00 | 8.0 | 6.9 | 93.0 | 0.0 | 0.0 | 1017.0 | 2021 | 1 | 1 | 0 | 30 | NaN |
| 2021-01-01 00:35:00 | 2005.63 | 8.0 | 6.9 | 93.0 | 0.0 | 0.0 | 1017.0 | 2021 | 1 | 1 | 0 | 35 | NaN |
| 2021-01-01 00:40:00 | 1977.60 | 8.0 | 6.9 | 93.0 | 0.0 | 0.0 | 1017.0 | 2021 | 1 | 1 | 0 | 40 | 1999.076667 |
| 2021-01-01 00:45:00 | 1976.44 | 8.0 | 6.9 | 93.0 | 0.0 | 0.0 | 1017.0 | 2021 | 1 | 1 | 0 | 45 | 1986.556667 |
| 2021-01-01 00:50:00 | 1954.37 | 8.0 | 6.9 | 93.0 | 0.0 | 0.0 | 1017.0 | 2021 | 1 | 1 | 0 | 50 | 1969.470000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2024-12-12 00:10:00 | 2146.84 | 12.3 | 6.8 | 69.0 | 269.0 | 1.8 | 1019.4 | 2024 | 12 | 12 | 0 | 10 | 2174.893333 |
| 2024-12-12 00:15:00 | 2116.66 | 12.3 | 6.8 | 69.0 | 269.0 | 1.8 | 1019.4 | 2024 | 12 | 12 | 0 | 15 | 2139.416667 |
| 2024-12-12 00:20:00 | 2082.77 | 12.3 | 6.8 | 69.0 | 269.0 | 1.8 | 1019.4 | 2024 | 12 | 12 | 0 | 20 | 2115.423333 |
| 2024-12-12 00:25:00 | 2059.17 | 12.3 | 6.8 | 69.0 | 269.0 | 1.8 | 1019.4 | 2024 | 12 | 12 | 0 | 25 | 2086.200000 |
| 2024-12-12 00:30:00 | 2049.66 | 12.3 | 6.8 | 69.0 | 269.0 | 1.8 | 1019.4 | 2024 | 12 | 12 | 0 | 30 | 2063.866667 |
393440 rows × 13 columns
In [12]:
#summary statrictics of a few columns
data[['Power demand','temp','rhum']].describe()
Out[12]:
| Power demand | temp | rhum | |
|---|---|---|---|
| count | 393440.000000 | 393440.000000 | 393440.000000 |
| mean | 3960.736469 | 25.527913 | 63.435767 |
| std | 1300.473773 | 7.981563 | 24.850663 |
| min | 1302.080000 | 4.000000 | 5.000000 |
| 25% | 3074.900000 | 20.000000 | 44.000000 |
| 50% | 3832.320000 | 27.000000 | 67.000000 |
| 75% | 4870.465000 | 31.000000 | 84.000000 |
| max | 8631.530000 | 46.400000 | 100.000000 |
In [13]:
#exploring how many missing values there are
data.isnull().sum()
Out[13]:
Power demand 0 temp 0 dwpt 0 rhum 0 wdir 540 wspd 0 pres 0 year 0 month 0 day 0 hour 0 minute 0 moving_avg_3 2 dtype: int64
In [14]:
data = data.drop(columns=['wdir'])
In [16]:
#show the records that contain any null values
data[data.isna().any(axis=1)]
Out[16]:
| Power demand | temp | dwpt | rhum | wspd | pres | year | month | day | hour | minute | moving_avg_3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| datetime | ||||||||||||
| 2021-01-01 00:30:00 | 2014.00 | 8.0 | 6.9 | 93.0 | 0.0 | 1017.0 | 2021 | 1 | 1 | 0 | 30 | NaN |
| 2021-01-01 00:35:00 | 2005.63 | 8.0 | 6.9 | 93.0 | 0.0 | 1017.0 | 2021 | 1 | 1 | 0 | 35 | NaN |
In [18]:
#drop all records that contain all missing values
data = data.dropna(how = 'all')
data[data.isna().all(axis=1)]
Out[18]:
| Power demand | temp | dwpt | rhum | wspd | pres | year | month | day | hour | minute | moving_avg_3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| datetime |
In [22]:
data.tail()
Out[22]:
| Power demand | temp | dwpt | rhum | wspd | pres | year | month | day | hour | minute | moving_avg_3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| datetime | ||||||||||||
| 2024-12-12 00:10:00 | 2146.84 | 12.3 | 6.8 | 69.0 | 1.8 | 1019.4 | 2024 | 12 | 12 | 0 | 10 | 2174.893333 |
| 2024-12-12 00:15:00 | 2116.66 | 12.3 | 6.8 | 69.0 | 1.8 | 1019.4 | 2024 | 12 | 12 | 0 | 15 | 2139.416667 |
| 2024-12-12 00:20:00 | 2082.77 | 12.3 | 6.8 | 69.0 | 1.8 | 1019.4 | 2024 | 12 | 12 | 0 | 20 | 2115.423333 |
| 2024-12-12 00:25:00 | 2059.17 | 12.3 | 6.8 | 69.0 | 1.8 | 1019.4 | 2024 | 12 | 12 | 0 | 25 | 2086.200000 |
| 2024-12-12 00:30:00 | 2049.66 | 12.3 | 6.8 | 69.0 | 1.8 | 1019.4 | 2024 | 12 | 12 | 0 | 30 | 2063.866667 |
In [23]:
#making sure that the index is in the datetime formart
data.index = pd.to_datetime(data.index)
In [24]:
#adding the "dayofweek" and "dayofyear" columns
data['dayofweek'] = data.index.dayofweek + 1 #monday as 1 and Sunday as 7
data['dayofyear'] = data.index.dayofyear #numbering the days from day 1 to day 366
In [26]:
data.tail()
Out[26]:
| Power demand | temp | dwpt | rhum | wspd | pres | year | month | day | hour | minute | moving_avg_3 | dayofweek | dayofyear | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| datetime | ||||||||||||||
| 2024-12-12 00:10:00 | 2146.84 | 12.3 | 6.8 | 69.0 | 1.8 | 1019.4 | 2024 | 12 | 12 | 0 | 10 | 2174.893333 | 4 | 347 |
| 2024-12-12 00:15:00 | 2116.66 | 12.3 | 6.8 | 69.0 | 1.8 | 1019.4 | 2024 | 12 | 12 | 0 | 15 | 2139.416667 | 4 | 347 |
| 2024-12-12 00:20:00 | 2082.77 | 12.3 | 6.8 | 69.0 | 1.8 | 1019.4 | 2024 | 12 | 12 | 0 | 20 | 2115.423333 | 4 | 347 |
| 2024-12-12 00:25:00 | 2059.17 | 12.3 | 6.8 | 69.0 | 1.8 | 1019.4 | 2024 | 12 | 12 | 0 | 25 | 2086.200000 | 4 | 347 |
| 2024-12-12 00:30:00 | 2049.66 | 12.3 | 6.8 | 69.0 | 1.8 | 1019.4 | 2024 | 12 | 12 | 0 | 30 | 2063.866667 | 4 | 347 |
In [31]:
#Feature engineering, creating of year quaters
data['quarter'] = data.index.quarter
data.head()
Out[31]:
| Power demand | temp | dwpt | rhum | wspd | pres | year | month | day | hour | minute | moving_avg_3 | dayofweek | dayofyear | quarter | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| datetime | |||||||||||||||
| 2021-01-01 00:30:00 | 2014.00 | 8.0 | 6.9 | 93.0 | 0.0 | 1017.0 | 2021 | 1 | 1 | 0 | 30 | NaN | 5 | 1 | 1 |
| 2021-01-01 00:35:00 | 2005.63 | 8.0 | 6.9 | 93.0 | 0.0 | 1017.0 | 2021 | 1 | 1 | 0 | 35 | NaN | 5 | 1 | 1 |
| 2021-01-01 00:40:00 | 1977.60 | 8.0 | 6.9 | 93.0 | 0.0 | 1017.0 | 2021 | 1 | 1 | 0 | 40 | 1999.076667 | 5 | 1 | 1 |
| 2021-01-01 00:45:00 | 1976.44 | 8.0 | 6.9 | 93.0 | 0.0 | 1017.0 | 2021 | 1 | 1 | 0 | 45 | 1986.556667 | 5 | 1 | 1 |
| 2021-01-01 00:50:00 | 1954.37 | 8.0 | 6.9 | 93.0 | 0.0 | 1017.0 | 2021 | 1 | 1 | 0 | 50 | 1969.470000 | 5 | 1 | 1 |
In [32]:
data.info()
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 393440 entries, 2021-01-01 00:30:00 to 2024-12-12 00:30:00 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Power demand 393440 non-null float64 1 temp 393440 non-null float64 2 dwpt 393440 non-null float64 3 rhum 393440 non-null float64 4 wspd 393440 non-null float64 5 pres 393440 non-null float64 6 year 393440 non-null int64 7 month 393440 non-null int64 8 day 393440 non-null int64 9 hour 393440 non-null int64 10 minute 393440 non-null int64 11 moving_avg_3 393438 non-null float64 12 dayofweek 393440 non-null int32 13 dayofyear 393440 non-null int32 14 quarter 393440 non-null int32 dtypes: float64(7), int32(3), int64(5) memory usage: 43.5 MB
In [34]:
#create another column for the "weekofyear"
data.insert(7,'weekofyear', data.index.isocalendar().week.astype(int))
In [42]:
data.head(1000)
Out[42]:
| Power demand | temp | dwpt | rhum | wspd | pres | year | weekofyear | month | day | hour | minute | moving_avg_3 | dayofweek | dayofyear | quarter | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| datetime | ||||||||||||||||
| 2021-01-01 00:30:00 | 2014.00 | 8.0 | 6.9 | 93.0 | 0.0 | 1017.0 | 2021 | 53 | 1 | 1 | 0 | 30 | NaN | 5 | 1 | 1 |
| 2021-01-01 00:35:00 | 2005.63 | 8.0 | 6.9 | 93.0 | 0.0 | 1017.0 | 2021 | 53 | 1 | 1 | 0 | 35 | NaN | 5 | 1 | 1 |
| 2021-01-01 00:40:00 | 1977.60 | 8.0 | 6.9 | 93.0 | 0.0 | 1017.0 | 2021 | 53 | 1 | 1 | 0 | 40 | 1999.076667 | 5 | 1 | 1 |
| 2021-01-01 00:45:00 | 1976.44 | 8.0 | 6.9 | 93.0 | 0.0 | 1017.0 | 2021 | 53 | 1 | 1 | 0 | 45 | 1986.556667 | 5 | 1 | 1 |
| 2021-01-01 00:50:00 | 1954.37 | 8.0 | 6.9 | 93.0 | 0.0 | 1017.0 | 2021 | 53 | 1 | 1 | 0 | 50 | 1969.470000 | 5 | 1 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2021-01-04 11:25:00 | 3505.59 | 16.0 | 14.0 | 88.0 | 9.4 | 1018.0 | 2021 | 1 | 1 | 4 | 11 | 25 | 3489.023333 | 1 | 4 | 1 |
| 2021-01-04 11:30:00 | 3471.75 | 17.8 | 16.5 | 92.0 | 7.6 | 1017.3 | 2021 | 1 | 1 | 4 | 11 | 30 | 3489.096667 | 1 | 4 | 1 |
| 2021-01-04 11:35:00 | 3487.28 | 17.8 | 16.5 | 92.0 | 7.6 | 1017.3 | 2021 | 1 | 1 | 4 | 11 | 35 | 3488.206667 | 1 | 4 | 1 |
| 2021-01-04 11:40:00 | 3501.53 | 17.8 | 16.5 | 92.0 | 7.6 | 1017.3 | 2021 | 1 | 1 | 4 | 11 | 40 | 3486.853333 | 1 | 4 | 1 |
| 2021-01-04 11:45:00 | 3481.15 | 17.8 | 16.5 | 92.0 | 7.6 | 1017.3 | 2021 | 1 | 1 | 4 | 11 | 45 | 3489.986667 | 1 | 4 | 1 |
1000 rows × 16 columns
In [43]:
#create anew column to check if the date is a weekend or not
data.insert(7, 'is_weekend', data.index.dayofweek.isin([6,7]))
In [45]:
data.info()
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 393440 entries, 2021-01-01 00:30:00 to 2024-12-12 00:30:00 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Power demand 393440 non-null float64 1 temp 393440 non-null float64 2 dwpt 393440 non-null float64 3 rhum 393440 non-null float64 4 wspd 393440 non-null float64 5 pres 393440 non-null float64 6 year 393440 non-null int64 7 is_weekend 393440 non-null bool 8 weekofyear 393440 non-null int64 9 month 393440 non-null int64 10 day 393440 non-null int64 11 hour 393440 non-null int64 12 minute 393440 non-null int64 13 moving_avg_3 393438 non-null float64 14 dayofweek 393440 non-null int32 15 dayofyear 393440 non-null int32 16 quarter 393440 non-null int32 dtypes: bool(1), float64(7), int32(3), int64(6) memory usage: 46.9 MB
In [46]:
data['is_weekend'] = data['is_weekend'].astype(int)
In [47]:
data.info()
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 393440 entries, 2021-01-01 00:30:00 to 2024-12-12 00:30:00 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Power demand 393440 non-null float64 1 temp 393440 non-null float64 2 dwpt 393440 non-null float64 3 rhum 393440 non-null float64 4 wspd 393440 non-null float64 5 pres 393440 non-null float64 6 year 393440 non-null int64 7 is_weekend 393440 non-null int64 8 weekofyear 393440 non-null int64 9 month 393440 non-null int64 10 day 393440 non-null int64 11 hour 393440 non-null int64 12 minute 393440 non-null int64 13 moving_avg_3 393438 non-null float64 14 dayofweek 393440 non-null int32 15 dayofyear 393440 non-null int32 16 quarter 393440 non-null int32 dtypes: float64(7), int32(3), int64(7) memory usage: 49.5 MB
In [49]:
#install and importing the holidays package for checking if the particular day falls on a public holiday or not
!pip install holidays
import holidays
Collecting holidays Downloading holidays-0.92-py3-none-any.whl.metadata (50 kB) Requirement already satisfied: python-dateutil<3,>=2.9.0.post0 in c:\users\t\anaconda3\lib\site-packages (from holidays) (2.9.0.post0) Requirement already satisfied: six>=1.5 in c:\users\t\anaconda3\lib\site-packages (from python-dateutil<3,>=2.9.0.post0->holidays) (1.17.0) Downloading holidays-0.92-py3-none-any.whl (1.4 MB) ---------------------------------------- 0.0/1.4 MB ? eta -:--:-- ---------------------------------------- 1.4/1.4 MB 10.4 MB/s 0:00:00 Installing collected packages: holidays Successfully installed holidays-0.92
In [50]:
holidays.IN(years= data.year)
Out[50]:
{datetime.date(2024, 1, 26): 'Republic Day', datetime.date(2024, 8, 15): 'Independence Day', datetime.date(2024, 10, 2): 'Gandhi Jayanti', datetime.date(2024, 5, 23): 'Buddha Purnima', datetime.date(2024, 10, 31): 'Diwali', datetime.date(2024, 8, 26): 'Janmashtami', datetime.date(2024, 10, 12): 'Dussehra', datetime.date(2024, 4, 21): 'Mahavir Jayanti', datetime.date(2024, 3, 8): 'Maha Shivaratri', datetime.date(2024, 11, 15): 'Guru Nanak Jayanti', datetime.date(2024, 7, 17): 'Muharram', datetime.date(2024, 9, 16): 'Milad-un-Nabi', datetime.date(2024, 4, 11): 'Id-ul-Fitr', datetime.date(2024, 6, 17): 'Bakrid', datetime.date(2024, 3, 29): 'Good Friday', datetime.date(2024, 12, 25): 'Christmas', datetime.date(2021, 1, 26): 'Republic Day', datetime.date(2021, 8, 15): 'Independence Day', datetime.date(2021, 10, 2): 'Gandhi Jayanti', datetime.date(2021, 5, 26): 'Buddha Purnima', datetime.date(2021, 11, 4): 'Diwali', datetime.date(2021, 8, 30): 'Janmashtami', datetime.date(2021, 10, 15): 'Dussehra', datetime.date(2021, 4, 25): 'Mahavir Jayanti', datetime.date(2021, 3, 11): 'Maha Shivaratri', datetime.date(2021, 11, 19): 'Guru Nanak Jayanti', datetime.date(2021, 8, 20): 'Muharram', datetime.date(2021, 10, 19): 'Milad-un-Nabi', datetime.date(2021, 5, 14): 'Id-ul-Fitr', datetime.date(2021, 7, 21): 'Bakrid', datetime.date(2021, 4, 2): 'Good Friday', datetime.date(2021, 12, 25): 'Christmas', datetime.date(2022, 1, 26): 'Republic Day', datetime.date(2022, 8, 15): 'Independence Day', datetime.date(2022, 10, 2): 'Gandhi Jayanti', datetime.date(2022, 5, 16): 'Buddha Purnima', datetime.date(2022, 10, 24): 'Diwali', datetime.date(2022, 8, 19): 'Janmashtami', datetime.date(2022, 10, 5): 'Dussehra', datetime.date(2022, 4, 14): 'Mahavir Jayanti', datetime.date(2022, 3, 1): 'Maha Shivaratri', datetime.date(2022, 11, 8): 'Guru Nanak Jayanti', datetime.date(2022, 8, 9): 'Muharram', datetime.date(2022, 10, 9): 'Milad-un-Nabi', datetime.date(2022, 5, 3): 'Id-ul-Fitr', datetime.date(2022, 7, 10): 'Bakrid', datetime.date(2022, 4, 15): 'Good Friday', datetime.date(2022, 12, 25): 'Christmas', datetime.date(2023, 1, 26): 'Republic Day', datetime.date(2023, 8, 15): 'Independence Day', datetime.date(2023, 10, 2): 'Gandhi Jayanti', datetime.date(2023, 5, 5): 'Buddha Purnima', datetime.date(2023, 11, 12): 'Diwali', datetime.date(2023, 9, 7): 'Janmashtami', datetime.date(2023, 10, 24): 'Dussehra', datetime.date(2023, 4, 4): 'Mahavir Jayanti', datetime.date(2023, 2, 18): 'Maha Shivaratri', datetime.date(2023, 11, 27): 'Guru Nanak Jayanti', datetime.date(2023, 7, 29): 'Muharram', datetime.date(2023, 9, 28): 'Milad-un-Nabi', datetime.date(2023, 4, 22): 'Id-ul-Fitr', datetime.date(2023, 6, 29): 'Bakrid', datetime.date(2023, 4, 7): 'Good Friday', datetime.date(2023, 12, 25): 'Christmas'}
In [57]:
#saving the new data in its own column
data['Holidays'] = holidays.IN(years= data.year)
data.head(3)
Out[57]:
| Power demand | temp | dwpt | rhum | wspd | pres | year | is_weekend | weekofyear | month | day | hour | minute | moving_avg_3 | dayofweek | dayofyear | quarter | Holidays | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| datetime | ||||||||||||||||||
| 2021-01-01 00:30:00 | 2014.00 | 8.0 | 6.9 | 93.0 | 0.0 | 1017.0 | 2021 | 0 | 53 | 1 | 1 | 0 | 30 | NaN | 5 | 1 | 1 | NaN |
| 2021-01-01 00:35:00 | 2005.63 | 8.0 | 6.9 | 93.0 | 0.0 | 1017.0 | 2021 | 0 | 53 | 1 | 1 | 0 | 35 | NaN | 5 | 1 | 1 | NaN |
| 2021-01-01 00:40:00 | 1977.60 | 8.0 | 6.9 | 93.0 | 0.0 | 1017.0 | 2021 | 0 | 53 | 1 | 1 | 0 | 40 | 1999.076667 | 5 | 1 | 1 | NaN |
In [58]:
# Dropping some columns we do not need
columns_to_drop = ['dwpt', 'wspd', 'pres', 'minute', 'day', 'moving_avg_3']
# Drop the columns
data.drop(columns=columns_to_drop, inplace=True)
# Check the result of the remaining columns
print(data.columns.tolist())
['Power demand', 'temp', 'rhum', 'year', 'is_weekend', 'weekofyear', 'month', 'hour', 'dayofweek', 'dayofyear', 'quarter', 'Holidays']
In [59]:
data.head()
Out[59]:
| Power demand | temp | rhum | year | is_weekend | weekofyear | month | hour | dayofweek | dayofyear | quarter | Holidays | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| datetime | ||||||||||||
| 2021-01-01 00:30:00 | 2014.00 | 8.0 | 93.0 | 2021 | 0 | 53 | 1 | 0 | 5 | 1 | 1 | NaN |
| 2021-01-01 00:35:00 | 2005.63 | 8.0 | 93.0 | 2021 | 0 | 53 | 1 | 0 | 5 | 1 | 1 | NaN |
| 2021-01-01 00:40:00 | 1977.60 | 8.0 | 93.0 | 2021 | 0 | 53 | 1 | 0 | 5 | 1 | 1 | NaN |
| 2021-01-01 00:45:00 | 1976.44 | 8.0 | 93.0 | 2021 | 0 | 53 | 1 | 0 | 5 | 1 | 1 | NaN |
| 2021-01-01 00:50:00 | 1954.37 | 8.0 | 93.0 | 2021 | 0 | 53 | 1 | 0 | 5 | 1 | 1 | NaN |
In [60]:
#Creating some laggged features
#These are past demand values: strong predictors which can be used as input to predict future values
#enables the models to learn from the historical data and improve from historical accurancy
In [61]:
#creating a demand lag feature from the same hour yesterday, and from the same hour last week
data['demand_lagged_24hr'] = data ['Power demand'].shift(24)
In [70]:
#creating another feature fro the demand lagged one week
#24hrs * 7 = 168 hours for a weekly shift
data['demand_lagged_168hr'] = data ['Power demand'].shift(186)
In [71]:
#creating two columns, one for the moving average and another for the standard deviation
In [72]:
data['demand_rolling_mean_24hr'] = data['Power demand'].rolling(window=24).mean()
In [76]:
data['demand_rolling_std_24hr'] = data['Power demand'].rolling(window=24).std()
In [ ]:
#dropping the holiday column since we will ont use it
data = data.drop('Holidays',axis=1)
In [80]:
#dropping all the rows having nan values
data = data.dropna()
data
Out[80]:
| Power demand | temp | rhum | year | is_weekend | weekofyear | month | hour | dayofweek | dayofyear | quarter | demand_lagged_24hr | demand_lagged_168hr | demand_rolling_mean_24hr | demand_rolling_std_24hr | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| datetime | |||||||||||||||
| 2021-01-01 16:00:00 | 3449.66 | 18.0 | 42.0 | 2021 | 0 | 53 | 1 | 16 | 5 | 1 | 1 | 3938.71 | 2014.00 | 3648.217083 | 159.107017 |
| 2021-01-01 16:05:00 | 3415.85 | 18.0 | 42.0 | 2021 | 0 | 53 | 1 | 16 | 5 | 1 | 1 | 3918.13 | 2005.63 | 3627.288750 | 155.042152 |
| 2021-01-01 16:10:00 | 3420.58 | 18.0 | 42.0 | 2021 | 0 | 53 | 1 | 16 | 5 | 1 | 1 | 3905.85 | 1977.60 | 3607.069167 | 148.645444 |
| 2021-01-01 16:15:00 | 3406.31 | 18.0 | 42.0 | 2021 | 0 | 53 | 1 | 16 | 5 | 1 | 1 | 3867.80 | 1976.44 | 3587.840417 | 143.200310 |
| 2021-01-01 16:20:00 | 3417.71 | 18.0 | 42.0 | 2021 | 0 | 53 | 1 | 16 | 5 | 1 | 1 | 3855.52 | 1954.37 | 3569.598333 | 135.285687 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2024-12-12 00:10:00 | 2146.84 | 12.3 | 69.0 | 2024 | 0 | 50 | 12 | 0 | 4 | 347 | 4 | 3446.53 | 3588.77 | 3120.540417 | 373.904837 |
| 2024-12-12 00:15:00 | 2116.66 | 12.3 | 69.0 | 2024 | 0 | 50 | 12 | 0 | 4 | 347 | 4 | 3402.37 | 3568.09 | 3066.969167 | 420.919382 |
| 2024-12-12 00:20:00 | 2082.77 | 12.3 | 69.0 | 2024 | 0 | 50 | 12 | 0 | 4 | 347 | 4 | 3348.17 | 3457.49 | 3014.244167 | 461.464483 |
| 2024-12-12 00:25:00 | 2059.17 | 12.3 | 69.0 | 2024 | 0 | 50 | 12 | 0 | 4 | 347 | 4 | 3311.41 | 3469.89 | 2962.067500 | 495.911949 |
| 2024-12-12 00:30:00 | 2049.66 | 12.3 | 69.0 | 2024 | 0 | 50 | 12 | 0 | 4 | 347 | 4 | 3297.00 | 3413.51 | 2910.095000 | 523.858573 |
393254 rows × 15 columns
In [83]:
#plot raw data over time (demand over time)
data['Power demand'].plot(figsize=(15,6), title = 'Electricity Demand Over Time')
plt.xlabel('Year')
plt.ylabel('Demand (in MW)')
plt.show()
In [85]:
#visualize the demand by hour of the day
plt.figure(figsize = (15,6))
sns.boxplot(data = data, x = 'hour', y = 'Power demand')
plt.title("Demand by the Hour")
plt.show()
In [87]:
#visualize the demand by month
plt.figure (figsize=(15,6))
sns.boxplot(data = data, x = 'month', y ='Power demand')
plt.title("Demand by Month")
plt.show()
In [92]:
plt.figure(figsize=(10,10))
sns.regplot(data=data, x='temp', y='Power demand',
scatter_kws={'alpha':0.5},
line_kws={'color': 'black', 'linewidth': 2})
plt.title("Demand vs Temperature with Trend Line")
plt.show()
In [97]:
#showinng the correlation matrix between the different features of the dataframe
plt.figure(figsize=(12,8))
sns.heatmap(data.corr(), annot=True, fmt=".2f",cmap='YlGnBu')
plt.title('Correlation Matrix of Features')
plt.show()
In [99]:
#Building the prediction model, at the end of the day, we need to predict the power demand
In [100]:
#renaming the demand column
data.rename(columns={'Power demand': 'demand'}, inplace=True)
In [101]:
data.head()
Out[101]:
| demand | temp | rhum | year | is_weekend | weekofyear | month | hour | dayofweek | dayofyear | quarter | demand_lagged_24hr | demand_lagged_168hr | demand_rolling_mean_24hr | demand_rolling_std_24hr | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| datetime | |||||||||||||||
| 2021-01-01 16:00:00 | 3449.66 | 18.0 | 42.0 | 2021 | 0 | 53 | 1 | 16 | 5 | 1 | 1 | 3938.71 | 2014.00 | 3648.217083 | 159.107017 |
| 2021-01-01 16:05:00 | 3415.85 | 18.0 | 42.0 | 2021 | 0 | 53 | 1 | 16 | 5 | 1 | 1 | 3918.13 | 2005.63 | 3627.288750 | 155.042152 |
| 2021-01-01 16:10:00 | 3420.58 | 18.0 | 42.0 | 2021 | 0 | 53 | 1 | 16 | 5 | 1 | 1 | 3905.85 | 1977.60 | 3607.069167 | 148.645444 |
| 2021-01-01 16:15:00 | 3406.31 | 18.0 | 42.0 | 2021 | 0 | 53 | 1 | 16 | 5 | 1 | 1 | 3867.80 | 1976.44 | 3587.840417 | 143.200310 |
| 2021-01-01 16:20:00 | 3417.71 | 18.0 | 42.0 | 2021 | 0 | 53 | 1 | 16 | 5 | 1 | 1 | 3855.52 | 1954.37 | 3569.598333 | 135.285687 |
In [106]:
#defining the target variable (Y) dependent variable
Y = data.demand
In [107]:
#defining the features (X) predictors/ features/ variables
X = data.drop('demand', axis = 1)
In [108]:
print(Y)
datetime
2021-01-01 16:00:00 3449.66
2021-01-01 16:05:00 3415.85
2021-01-01 16:10:00 3420.58
2021-01-01 16:15:00 3406.31
2021-01-01 16:20:00 3417.71
...
2024-12-12 00:10:00 2146.84
2024-12-12 00:15:00 2116.66
2024-12-12 00:20:00 2082.77
2024-12-12 00:25:00 2059.17
2024-12-12 00:30:00 2049.66
Name: demand, Length: 393254, dtype: float64
In [109]:
X
Out[109]:
| temp | rhum | year | is_weekend | weekofyear | month | hour | dayofweek | dayofyear | quarter | demand_lagged_24hr | demand_lagged_168hr | demand_rolling_mean_24hr | demand_rolling_std_24hr | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| datetime | ||||||||||||||
| 2021-01-01 16:00:00 | 18.0 | 42.0 | 2021 | 0 | 53 | 1 | 16 | 5 | 1 | 1 | 3938.71 | 2014.00 | 3648.217083 | 159.107017 |
| 2021-01-01 16:05:00 | 18.0 | 42.0 | 2021 | 0 | 53 | 1 | 16 | 5 | 1 | 1 | 3918.13 | 2005.63 | 3627.288750 | 155.042152 |
| 2021-01-01 16:10:00 | 18.0 | 42.0 | 2021 | 0 | 53 | 1 | 16 | 5 | 1 | 1 | 3905.85 | 1977.60 | 3607.069167 | 148.645444 |
| 2021-01-01 16:15:00 | 18.0 | 42.0 | 2021 | 0 | 53 | 1 | 16 | 5 | 1 | 1 | 3867.80 | 1976.44 | 3587.840417 | 143.200310 |
| 2021-01-01 16:20:00 | 18.0 | 42.0 | 2021 | 0 | 53 | 1 | 16 | 5 | 1 | 1 | 3855.52 | 1954.37 | 3569.598333 | 135.285687 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2024-12-12 00:10:00 | 12.3 | 69.0 | 2024 | 0 | 50 | 12 | 0 | 4 | 347 | 4 | 3446.53 | 3588.77 | 3120.540417 | 373.904837 |
| 2024-12-12 00:15:00 | 12.3 | 69.0 | 2024 | 0 | 50 | 12 | 0 | 4 | 347 | 4 | 3402.37 | 3568.09 | 3066.969167 | 420.919382 |
| 2024-12-12 00:20:00 | 12.3 | 69.0 | 2024 | 0 | 50 | 12 | 0 | 4 | 347 | 4 | 3348.17 | 3457.49 | 3014.244167 | 461.464483 |
| 2024-12-12 00:25:00 | 12.3 | 69.0 | 2024 | 0 | 50 | 12 | 0 | 4 | 347 | 4 | 3311.41 | 3469.89 | 2962.067500 | 495.911949 |
| 2024-12-12 00:30:00 | 12.3 | 69.0 | 2024 | 0 | 50 | 12 | 0 | 4 | 347 | 4 | 3297.00 | 3413.51 | 2910.095000 | 523.858573 |
393254 rows × 14 columns
In [110]:
#splitting the data into training and testing data by 20% to 80%
X_train = X.loc[: '2024-02-29'] #taking 80% of the data as testing data
In [111]:
X_train
Out[111]:
| temp | rhum | year | is_weekend | weekofyear | month | hour | dayofweek | dayofyear | quarter | demand_lagged_24hr | demand_lagged_168hr | demand_rolling_mean_24hr | demand_rolling_std_24hr | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| datetime | ||||||||||||||
| 2021-01-01 16:00:00 | 18.0 | 42.0 | 2021 | 0 | 53 | 1 | 16 | 5 | 1 | 1 | 3938.71 | 2014.00 | 3648.217083 | 159.107017 |
| 2021-01-01 16:05:00 | 18.0 | 42.0 | 2021 | 0 | 53 | 1 | 16 | 5 | 1 | 1 | 3918.13 | 2005.63 | 3627.288750 | 155.042152 |
| 2021-01-01 16:10:00 | 18.0 | 42.0 | 2021 | 0 | 53 | 1 | 16 | 5 | 1 | 1 | 3905.85 | 1977.60 | 3607.069167 | 148.645444 |
| 2021-01-01 16:15:00 | 18.0 | 42.0 | 2021 | 0 | 53 | 1 | 16 | 5 | 1 | 1 | 3867.80 | 1976.44 | 3587.840417 | 143.200310 |
| 2021-01-01 16:20:00 | 18.0 | 42.0 | 2021 | 0 | 53 | 1 | 16 | 5 | 1 | 1 | 3855.52 | 1954.37 | 3569.598333 | 135.285687 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2024-02-29 23:25:00 | 18.0 | 64.0 | 2024 | 0 | 9 | 2 | 23 | 4 | 60 | 1 | 2926.06 | 3766.91 | 2627.903750 | 168.803709 |
| 2024-02-29 23:30:00 | 18.0 | 75.0 | 2024 | 0 | 9 | 2 | 23 | 4 | 60 | 1 | 2894.40 | 3832.04 | 2605.032917 | 168.310756 |
| 2024-02-29 23:35:00 | 18.0 | 75.0 | 2024 | 0 | 9 | 2 | 23 | 4 | 60 | 1 | 2841.57 | 3787.23 | 2582.132500 | 172.076109 |
| 2024-02-29 23:40:00 | 18.0 | 75.0 | 2024 | 0 | 9 | 2 | 23 | 4 | 60 | 1 | 2865.73 | 3792.01 | 2557.773333 | 171.563203 |
| 2024-02-29 23:45:00 | 18.0 | 75.0 | 2024 | 0 | 9 | 2 | 23 | 4 | 60 | 1 | 2821.97 | 3833.37 | 2534.717083 | 171.696401 |
314928 rows × 14 columns
In [113]:
Y_train = Y.loc[:'2024-02-29'] #taking 80% of the data as the training data
In [123]:
X_test = X.loc['2024-03-01':] #20% of testing dataset
In [120]:
Y_test = Y.loc['2024-03-01':]
In [124]:
print (X_train.shape)
print(Y_train.shape)
(314928, 14) (314928,)
In [125]:
print (X_test.shape)
print(Y_test.shape)
(78326, 14) (78326,)
In [128]:
#XGBoost Model
#XGboost used for time series forecasting, as it can handle non-linear data easily.
#it can handle seasonality and trends in time series data effectively.
In [130]:
!pip install xgboost
from xgboost import XGBRegressor
Collecting xgboost Downloading xgboost-3.2.0-py3-none-win_amd64.whl.metadata (2.1 kB) Requirement already satisfied: numpy in c:\users\t\anaconda3\lib\site-packages (from xgboost) (2.3.5) Requirement already satisfied: scipy in c:\users\t\anaconda3\lib\site-packages (from xgboost) (1.16.3) Downloading xgboost-3.2.0-py3-none-win_amd64.whl (101.7 MB) ---------------------------------------- 0.0/101.7 MB ? eta -:--:-- ---------------------------------------- 0.3/101.7 MB ? eta -:--:-- - -------------------------------------- 3.4/101.7 MB 13.6 MB/s eta 0:00:08 -- ------------------------------------- 5.8/101.7 MB 12.8 MB/s eta 0:00:08 --- ------------------------------------ 7.9/101.7 MB 11.6 MB/s eta 0:00:09 ---- ----------------------------------- 10.5/101.7 MB 11.7 MB/s eta 0:00:08 ----- ---------------------------------- 13.1/101.7 MB 11.8 MB/s eta 0:00:08 ----- ---------------------------------- 14.9/101.7 MB 11.6 MB/s eta 0:00:08 ----- ---------------------------------- 15.2/101.7 MB 9.8 MB/s eta 0:00:09 ------- -------------------------------- 19.1/101.7 MB 10.8 MB/s eta 0:00:08 -------- ------------------------------- 22.0/101.7 MB 11.1 MB/s eta 0:00:08 --------- ------------------------------ 24.4/101.7 MB 11.1 MB/s eta 0:00:07 ---------- ----------------------------- 26.2/101.7 MB 10.8 MB/s eta 0:00:07 ----------- ---------------------------- 28.8/101.7 MB 11.0 MB/s eta 0:00:07 ------------ --------------------------- 31.5/101.7 MB 11.1 MB/s eta 0:00:07 ------------- -------------------------- 34.3/101.7 MB 11.3 MB/s eta 0:00:06 -------------- ------------------------- 37.2/101.7 MB 11.4 MB/s eta 0:00:06 --------------- ------------------------ 39.3/101.7 MB 11.3 MB/s eta 0:00:06 ---------------- ----------------------- 41.9/101.7 MB 11.4 MB/s eta 0:00:06 ----------------- ---------------------- 44.3/101.7 MB 11.3 MB/s eta 0:00:06 ------------------ --------------------- 46.7/101.7 MB 11.3 MB/s eta 0:00:05 ------------------- -------------------- 49.0/101.7 MB 11.3 MB/s eta 0:00:05 -------------------- ------------------- 51.4/101.7 MB 11.3 MB/s eta 0:00:05 --------------------- ------------------ 54.0/101.7 MB 11.3 MB/s eta 0:00:05 ---------------------- ----------------- 56.4/101.7 MB 11.3 MB/s eta 0:00:05 ----------------------- ---------------- 59.2/101.7 MB 11.4 MB/s eta 0:00:04 ------------------------ --------------- 61.9/101.7 MB 11.4 MB/s eta 0:00:04 ------------------------- -------------- 64.7/101.7 MB 11.5 MB/s eta 0:00:04 -------------------------- ------------- 67.1/101.7 MB 11.5 MB/s eta 0:00:04 -------------------------- ------------- 67.4/101.7 MB 11.5 MB/s eta 0:00:03 ---------------------------- ----------- 71.6/101.7 MB 11.4 MB/s eta 0:00:03 ---------------------------- ----------- 73.7/101.7 MB 11.4 MB/s eta 0:00:03 ------------------------------ --------- 76.5/101.7 MB 11.4 MB/s eta 0:00:03 ------------------------------- -------- 78.9/101.7 MB 11.4 MB/s eta 0:00:02 ------------------------------- -------- 80.7/101.7 MB 11.3 MB/s eta 0:00:02 -------------------------------- ------- 82.6/101.7 MB 11.2 MB/s eta 0:00:02 -------------------------------- ------- 83.9/101.7 MB 11.1 MB/s eta 0:00:02 --------------------------------- ------ 85.7/101.7 MB 11.0 MB/s eta 0:00:02 ---------------------------------- ----- 88.3/101.7 MB 11.0 MB/s eta 0:00:02 ----------------------------------- ---- 91.2/101.7 MB 11.1 MB/s eta 0:00:01 ------------------------------------ --- 93.8/101.7 MB 11.1 MB/s eta 0:00:01 ------------------------------------- -- 95.7/101.7 MB 11.1 MB/s eta 0:00:01 -------------------------------------- - 98.3/101.7 MB 11.1 MB/s eta 0:00:01 -------------------------------------- 100.9/101.7 MB 11.1 MB/s eta 0:00:01 -------------------------------------- 101.4/101.7 MB 11.1 MB/s eta 0:00:01 ---------------------------------------- 101.7/101.7 MB 10.8 MB/s 0:00:09 Installing collected packages: xgboost Successfully installed xgboost-3.2.0
In [135]:
#from sklearn library importing two fundamental metrics to calculate the performance of the model
from sklearn.metrics import mean_squared_error, mean_absolute_error
#from model_selection module of scikit-learn's library, import timeseries split class
#which perfomrs appropriate cross validation when dealing with time series data
from sklearn.model_selection import TimeSeriesSplit
In [136]:
#Initializing and training the model
model_xgb = XGBRegressor(n_estimators=1000,
early_stopping_rounds=50,
learning_rate=0.01,
random_state=42,
objective='reg:squarederror')
In [137]:
model_xgb.fit(X_train, Y_train, eval_set = [(X_train, Y_train), (X_test,Y_test)], verbose=False)
Out[137]:
XGBRegressor(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=50,
enable_categorical=False, eval_metric=None, feature_types=None,
feature_weights=None, gamma=None, grow_policy=None,
importance_type=None, interaction_constraints=None,
learning_rate=0.01, max_bin=None, max_cat_threshold=None,
max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
max_leaves=None, min_child_weight=None, missing=nan,
monotone_constraints=None, multi_strategy=None, n_estimators=1000,
n_jobs=None, num_parallel_tree=None, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Parameters
| objective | 'reg:squarederror' | |
| base_score | None | |
| booster | None | |
| callbacks | None | |
| colsample_bylevel | None | |
| colsample_bynode | None | |
| colsample_bytree | None | |
| device | None | |
| early_stopping_rounds | 50 | |
| enable_categorical | False | |
| eval_metric | None | |
| feature_types | None | |
| feature_weights | None | |
| gamma | None | |
| grow_policy | None | |
| importance_type | None | |
| interaction_constraints | None | |
| learning_rate | 0.01 | |
| max_bin | None | |
| max_cat_threshold | None | |
| max_cat_to_onehot | None | |
| max_delta_step | None | |
| max_depth | None | |
| max_leaves | None | |
| min_child_weight | None | |
| missing | nan | |
| monotone_constraints | None | |
| multi_strategy | None | |
| n_estimators | 1000 | |
| n_jobs | None | |
| num_parallel_tree | None | |
| random_state | 42 | |
| reg_alpha | None | |
| reg_lambda | None | |
| sampling_method | None | |
| scale_pos_weight | None | |
| subsample | None | |
| tree_method | None | |
| validate_parameters | None | |
| verbosity | None |
In [138]:
#making predictions
predictions_xgb = model_xgb.predict(X_test)
In [139]:
#evaluating the model
rmse_xgb = np.sqrt(mean_squared_error(Y_test,predictions_xgb))
mae_xgb = mean_absolute_error(Y_test,predictions_xgb)
In [141]:
print('XGBoost RMSE:', rmse_xgb)
print('XGBoost MAE:', mae_xgb)
XGBoost RMSE: 191.6282277687719 XGBoost MAE: 88.88229013144982
In [152]:
#visualize the predictions
plt.figure(figsize = (15,6))
plt.plot(Y_test.index,Y_test, label= 'Actual demand')
plt.plot(Y_test.index, predictions_xgb, label='Predicted Demand', color = 'Red')
plt.title('XGBoost Electricity demand Prediction')
plt.xlabel('Date')
plt.ylabel('Power Demand')
plt.legend()
plt.show()
In [153]:
#saving the model
import joblib
joblib.dump(model_xgb, 'electricity_xbg_prediction_model.pkl')
Out[153]:
['electricity_xbg_prediction_model.pkl']
In [ ]: