In [1]:
#importing the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import random
In [2]:
#importing the dataset

data = pd.read_csv(r"C:\Users\T\Desktop\UCT Integrated Data Energy Analytics\Energy Data Sets\Delhi 5 Minute Electricity Demand for Forecasting Dataset\Delhi_electrictity_demand_data.csv")
In [3]:
data
Out[3]:
Unnamed: 0 datetime Power demand temp dwpt rhum wdir wspd pres year month day hour minute moving_avg_3
0 0 2021-01-01 00:30:00 2014.00 8.0 6.9 93.0 0.0 0.0 1017.0 2021 1 1 0 30 NaN
1 1 2021-01-01 00:35:00 2005.63 8.0 6.9 93.0 0.0 0.0 1017.0 2021 1 1 0 35 NaN
2 2 2021-01-01 00:40:00 1977.60 8.0 6.9 93.0 0.0 0.0 1017.0 2021 1 1 0 40 1999.076667
3 3 2021-01-01 00:45:00 1976.44 8.0 6.9 93.0 0.0 0.0 1017.0 2021 1 1 0 45 1986.556667
4 4 2021-01-01 00:50:00 1954.37 8.0 6.9 93.0 0.0 0.0 1017.0 2021 1 1 0 50 1969.470000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
393435 393435 2024-12-12 00:10:00 2146.84 12.3 6.8 69.0 269.0 1.8 1019.4 2024 12 12 0 10 2174.893333
393436 393436 2024-12-12 00:15:00 2116.66 12.3 6.8 69.0 269.0 1.8 1019.4 2024 12 12 0 15 2139.416667
393437 393437 2024-12-12 00:20:00 2082.77 12.3 6.8 69.0 269.0 1.8 1019.4 2024 12 12 0 20 2115.423333
393438 393438 2024-12-12 00:25:00 2059.17 12.3 6.8 69.0 269.0 1.8 1019.4 2024 12 12 0 25 2086.200000
393439 393439 2024-12-12 00:30:00 2049.66 12.3 6.8 69.0 269.0 1.8 1019.4 2024 12 12 0 30 2063.866667

393440 rows × 15 columns

In [4]:
#checking the basic informatrion about the data
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 393440 entries, 0 to 393439
Data columns (total 15 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Unnamed: 0    393440 non-null  int64  
 1   datetime      393440 non-null  object 
 2   Power demand  393440 non-null  float64
 3   temp          393440 non-null  float64
 4   dwpt          393440 non-null  float64
 5   rhum          393440 non-null  float64
 6   wdir          392900 non-null  float64
 7   wspd          393440 non-null  float64
 8   pres          393440 non-null  float64
 9   year          393440 non-null  int64  
 10  month         393440 non-null  int64  
 11  day           393440 non-null  int64  
 12  hour          393440 non-null  int64  
 13  minute        393440 non-null  int64  
 14  moving_avg_3  393438 non-null  float64
dtypes: float64(8), int64(6), object(1)
memory usage: 45.0+ MB
In [5]:
#changing the timestamp to a date time format
data['datetime'] = pd.to_datetime(data['datetime'])
In [6]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 393440 entries, 0 to 393439
Data columns (total 15 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   Unnamed: 0    393440 non-null  int64         
 1   datetime      393440 non-null  datetime64[ns]
 2   Power demand  393440 non-null  float64       
 3   temp          393440 non-null  float64       
 4   dwpt          393440 non-null  float64       
 5   rhum          393440 non-null  float64       
 6   wdir          392900 non-null  float64       
 7   wspd          393440 non-null  float64       
 8   pres          393440 non-null  float64       
 9   year          393440 non-null  int64         
 10  month         393440 non-null  int64         
 11  day           393440 non-null  int64         
 12  hour          393440 non-null  int64         
 13  minute        393440 non-null  int64         
 14  moving_avg_3  393438 non-null  float64       
dtypes: datetime64[ns](1), float64(8), int64(6)
memory usage: 45.0 MB
In [7]:
#dropping the first column which contains only numbers and not useful data
data = data.drop(columns=['Unnamed: 0'])
In [8]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 393440 entries, 0 to 393439
Data columns (total 14 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   datetime      393440 non-null  datetime64[ns]
 1   Power demand  393440 non-null  float64       
 2   temp          393440 non-null  float64       
 3   dwpt          393440 non-null  float64       
 4   rhum          393440 non-null  float64       
 5   wdir          392900 non-null  float64       
 6   wspd          393440 non-null  float64       
 7   pres          393440 non-null  float64       
 8   year          393440 non-null  int64         
 9   month         393440 non-null  int64         
 10  day           393440 non-null  int64         
 11  hour          393440 non-null  int64         
 12  minute        393440 non-null  int64         
 13  moving_avg_3  393438 non-null  float64       
dtypes: datetime64[ns](1), float64(8), int64(5)
memory usage: 42.0 MB
In [9]:
#setting the datetime to the index column
data = data.set_index('datetime')
In [10]:
data.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 393440 entries, 2021-01-01 00:30:00 to 2024-12-12 00:30:00
Data columns (total 13 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Power demand  393440 non-null  float64
 1   temp          393440 non-null  float64
 2   dwpt          393440 non-null  float64
 3   rhum          393440 non-null  float64
 4   wdir          392900 non-null  float64
 5   wspd          393440 non-null  float64
 6   pres          393440 non-null  float64
 7   year          393440 non-null  int64  
 8   month         393440 non-null  int64  
 9   day           393440 non-null  int64  
 10  hour          393440 non-null  int64  
 11  minute        393440 non-null  int64  
 12  moving_avg_3  393438 non-null  float64
dtypes: float64(8), int64(5)
memory usage: 42.0 MB
In [11]:
data
Out[11]:
Power demand temp dwpt rhum wdir wspd pres year month day hour minute moving_avg_3
datetime
2021-01-01 00:30:00 2014.00 8.0 6.9 93.0 0.0 0.0 1017.0 2021 1 1 0 30 NaN
2021-01-01 00:35:00 2005.63 8.0 6.9 93.0 0.0 0.0 1017.0 2021 1 1 0 35 NaN
2021-01-01 00:40:00 1977.60 8.0 6.9 93.0 0.0 0.0 1017.0 2021 1 1 0 40 1999.076667
2021-01-01 00:45:00 1976.44 8.0 6.9 93.0 0.0 0.0 1017.0 2021 1 1 0 45 1986.556667
2021-01-01 00:50:00 1954.37 8.0 6.9 93.0 0.0 0.0 1017.0 2021 1 1 0 50 1969.470000
... ... ... ... ... ... ... ... ... ... ... ... ... ...
2024-12-12 00:10:00 2146.84 12.3 6.8 69.0 269.0 1.8 1019.4 2024 12 12 0 10 2174.893333
2024-12-12 00:15:00 2116.66 12.3 6.8 69.0 269.0 1.8 1019.4 2024 12 12 0 15 2139.416667
2024-12-12 00:20:00 2082.77 12.3 6.8 69.0 269.0 1.8 1019.4 2024 12 12 0 20 2115.423333
2024-12-12 00:25:00 2059.17 12.3 6.8 69.0 269.0 1.8 1019.4 2024 12 12 0 25 2086.200000
2024-12-12 00:30:00 2049.66 12.3 6.8 69.0 269.0 1.8 1019.4 2024 12 12 0 30 2063.866667

393440 rows × 13 columns

In [12]:
#summary statrictics of a few columns
data[['Power demand','temp','rhum']].describe()
Out[12]:
Power demand temp rhum
count 393440.000000 393440.000000 393440.000000
mean 3960.736469 25.527913 63.435767
std 1300.473773 7.981563 24.850663
min 1302.080000 4.000000 5.000000
25% 3074.900000 20.000000 44.000000
50% 3832.320000 27.000000 67.000000
75% 4870.465000 31.000000 84.000000
max 8631.530000 46.400000 100.000000
In [13]:
#exploring how many missing values there are
data.isnull().sum()
Out[13]:
Power demand      0
temp              0
dwpt              0
rhum              0
wdir            540
wspd              0
pres              0
year              0
month             0
day               0
hour              0
minute            0
moving_avg_3      2
dtype: int64
In [14]:
data = data.drop(columns=['wdir'])
In [16]:
#show the records that contain any null values
data[data.isna().any(axis=1)]
Out[16]:
Power demand temp dwpt rhum wspd pres year month day hour minute moving_avg_3
datetime
2021-01-01 00:30:00 2014.00 8.0 6.9 93.0 0.0 1017.0 2021 1 1 0 30 NaN
2021-01-01 00:35:00 2005.63 8.0 6.9 93.0 0.0 1017.0 2021 1 1 0 35 NaN
In [18]:
#drop all records that contain all missing values
data = data.dropna(how = 'all')
data[data.isna().all(axis=1)]
Out[18]:
Power demand temp dwpt rhum wspd pres year month day hour minute moving_avg_3
datetime
In [22]:
data.tail()
Out[22]:
Power demand temp dwpt rhum wspd pres year month day hour minute moving_avg_3
datetime
2024-12-12 00:10:00 2146.84 12.3 6.8 69.0 1.8 1019.4 2024 12 12 0 10 2174.893333
2024-12-12 00:15:00 2116.66 12.3 6.8 69.0 1.8 1019.4 2024 12 12 0 15 2139.416667
2024-12-12 00:20:00 2082.77 12.3 6.8 69.0 1.8 1019.4 2024 12 12 0 20 2115.423333
2024-12-12 00:25:00 2059.17 12.3 6.8 69.0 1.8 1019.4 2024 12 12 0 25 2086.200000
2024-12-12 00:30:00 2049.66 12.3 6.8 69.0 1.8 1019.4 2024 12 12 0 30 2063.866667
In [23]:
#making sure that the index is in the datetime formart
data.index = pd.to_datetime(data.index)
In [24]:
#adding the "dayofweek" and "dayofyear" columns
data['dayofweek'] = data.index.dayofweek + 1 #monday as 1 and Sunday as 7
data['dayofyear'] = data.index.dayofyear #numbering the days from day 1 to day 366
In [26]:
data.tail()
Out[26]:
Power demand temp dwpt rhum wspd pres year month day hour minute moving_avg_3 dayofweek dayofyear
datetime
2024-12-12 00:10:00 2146.84 12.3 6.8 69.0 1.8 1019.4 2024 12 12 0 10 2174.893333 4 347
2024-12-12 00:15:00 2116.66 12.3 6.8 69.0 1.8 1019.4 2024 12 12 0 15 2139.416667 4 347
2024-12-12 00:20:00 2082.77 12.3 6.8 69.0 1.8 1019.4 2024 12 12 0 20 2115.423333 4 347
2024-12-12 00:25:00 2059.17 12.3 6.8 69.0 1.8 1019.4 2024 12 12 0 25 2086.200000 4 347
2024-12-12 00:30:00 2049.66 12.3 6.8 69.0 1.8 1019.4 2024 12 12 0 30 2063.866667 4 347
In [31]:
#Feature engineering, creating of year quaters
data['quarter'] = data.index.quarter
data.head()
Out[31]:
Power demand temp dwpt rhum wspd pres year month day hour minute moving_avg_3 dayofweek dayofyear quarter
datetime
2021-01-01 00:30:00 2014.00 8.0 6.9 93.0 0.0 1017.0 2021 1 1 0 30 NaN 5 1 1
2021-01-01 00:35:00 2005.63 8.0 6.9 93.0 0.0 1017.0 2021 1 1 0 35 NaN 5 1 1
2021-01-01 00:40:00 1977.60 8.0 6.9 93.0 0.0 1017.0 2021 1 1 0 40 1999.076667 5 1 1
2021-01-01 00:45:00 1976.44 8.0 6.9 93.0 0.0 1017.0 2021 1 1 0 45 1986.556667 5 1 1
2021-01-01 00:50:00 1954.37 8.0 6.9 93.0 0.0 1017.0 2021 1 1 0 50 1969.470000 5 1 1
In [32]:
data.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 393440 entries, 2021-01-01 00:30:00 to 2024-12-12 00:30:00
Data columns (total 15 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Power demand  393440 non-null  float64
 1   temp          393440 non-null  float64
 2   dwpt          393440 non-null  float64
 3   rhum          393440 non-null  float64
 4   wspd          393440 non-null  float64
 5   pres          393440 non-null  float64
 6   year          393440 non-null  int64  
 7   month         393440 non-null  int64  
 8   day           393440 non-null  int64  
 9   hour          393440 non-null  int64  
 10  minute        393440 non-null  int64  
 11  moving_avg_3  393438 non-null  float64
 12  dayofweek     393440 non-null  int32  
 13  dayofyear     393440 non-null  int32  
 14  quarter       393440 non-null  int32  
dtypes: float64(7), int32(3), int64(5)
memory usage: 43.5 MB
In [34]:
#create another column for the "weekofyear"
data.insert(7,'weekofyear', data.index.isocalendar().week.astype(int))
In [42]:
data.head(1000)
Out[42]:
Power demand temp dwpt rhum wspd pres year weekofyear month day hour minute moving_avg_3 dayofweek dayofyear quarter
datetime
2021-01-01 00:30:00 2014.00 8.0 6.9 93.0 0.0 1017.0 2021 53 1 1 0 30 NaN 5 1 1
2021-01-01 00:35:00 2005.63 8.0 6.9 93.0 0.0 1017.0 2021 53 1 1 0 35 NaN 5 1 1
2021-01-01 00:40:00 1977.60 8.0 6.9 93.0 0.0 1017.0 2021 53 1 1 0 40 1999.076667 5 1 1
2021-01-01 00:45:00 1976.44 8.0 6.9 93.0 0.0 1017.0 2021 53 1 1 0 45 1986.556667 5 1 1
2021-01-01 00:50:00 1954.37 8.0 6.9 93.0 0.0 1017.0 2021 53 1 1 0 50 1969.470000 5 1 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2021-01-04 11:25:00 3505.59 16.0 14.0 88.0 9.4 1018.0 2021 1 1 4 11 25 3489.023333 1 4 1
2021-01-04 11:30:00 3471.75 17.8 16.5 92.0 7.6 1017.3 2021 1 1 4 11 30 3489.096667 1 4 1
2021-01-04 11:35:00 3487.28 17.8 16.5 92.0 7.6 1017.3 2021 1 1 4 11 35 3488.206667 1 4 1
2021-01-04 11:40:00 3501.53 17.8 16.5 92.0 7.6 1017.3 2021 1 1 4 11 40 3486.853333 1 4 1
2021-01-04 11:45:00 3481.15 17.8 16.5 92.0 7.6 1017.3 2021 1 1 4 11 45 3489.986667 1 4 1

1000 rows × 16 columns

In [43]:
#create anew column to check if the date is a weekend or not
data.insert(7, 'is_weekend', data.index.dayofweek.isin([6,7]))
In [45]:
data.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 393440 entries, 2021-01-01 00:30:00 to 2024-12-12 00:30:00
Data columns (total 17 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Power demand  393440 non-null  float64
 1   temp          393440 non-null  float64
 2   dwpt          393440 non-null  float64
 3   rhum          393440 non-null  float64
 4   wspd          393440 non-null  float64
 5   pres          393440 non-null  float64
 6   year          393440 non-null  int64  
 7   is_weekend    393440 non-null  bool   
 8   weekofyear    393440 non-null  int64  
 9   month         393440 non-null  int64  
 10  day           393440 non-null  int64  
 11  hour          393440 non-null  int64  
 12  minute        393440 non-null  int64  
 13  moving_avg_3  393438 non-null  float64
 14  dayofweek     393440 non-null  int32  
 15  dayofyear     393440 non-null  int32  
 16  quarter       393440 non-null  int32  
dtypes: bool(1), float64(7), int32(3), int64(6)
memory usage: 46.9 MB
In [46]:
data['is_weekend'] = data['is_weekend'].astype(int)
In [47]:
data.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 393440 entries, 2021-01-01 00:30:00 to 2024-12-12 00:30:00
Data columns (total 17 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Power demand  393440 non-null  float64
 1   temp          393440 non-null  float64
 2   dwpt          393440 non-null  float64
 3   rhum          393440 non-null  float64
 4   wspd          393440 non-null  float64
 5   pres          393440 non-null  float64
 6   year          393440 non-null  int64  
 7   is_weekend    393440 non-null  int64  
 8   weekofyear    393440 non-null  int64  
 9   month         393440 non-null  int64  
 10  day           393440 non-null  int64  
 11  hour          393440 non-null  int64  
 12  minute        393440 non-null  int64  
 13  moving_avg_3  393438 non-null  float64
 14  dayofweek     393440 non-null  int32  
 15  dayofyear     393440 non-null  int32  
 16  quarter       393440 non-null  int32  
dtypes: float64(7), int32(3), int64(7)
memory usage: 49.5 MB
In [49]:
#install and importing the holidays package for checking if the particular day falls on a public holiday or not
!pip install holidays
import holidays
Collecting holidays
  Downloading holidays-0.92-py3-none-any.whl.metadata (50 kB)
Requirement already satisfied: python-dateutil<3,>=2.9.0.post0 in c:\users\t\anaconda3\lib\site-packages (from holidays) (2.9.0.post0)
Requirement already satisfied: six>=1.5 in c:\users\t\anaconda3\lib\site-packages (from python-dateutil<3,>=2.9.0.post0->holidays) (1.17.0)
Downloading holidays-0.92-py3-none-any.whl (1.4 MB)
   ---------------------------------------- 0.0/1.4 MB ? eta -:--:--
   ---------------------------------------- 1.4/1.4 MB 10.4 MB/s  0:00:00
Installing collected packages: holidays
Successfully installed holidays-0.92
In [50]:
holidays.IN(years= data.year)
Out[50]:
{datetime.date(2024, 1, 26): 'Republic Day', datetime.date(2024, 8, 15): 'Independence Day', datetime.date(2024, 10, 2): 'Gandhi Jayanti', datetime.date(2024, 5, 23): 'Buddha Purnima', datetime.date(2024, 10, 31): 'Diwali', datetime.date(2024, 8, 26): 'Janmashtami', datetime.date(2024, 10, 12): 'Dussehra', datetime.date(2024, 4, 21): 'Mahavir Jayanti', datetime.date(2024, 3, 8): 'Maha Shivaratri', datetime.date(2024, 11, 15): 'Guru Nanak Jayanti', datetime.date(2024, 7, 17): 'Muharram', datetime.date(2024, 9, 16): 'Milad-un-Nabi', datetime.date(2024, 4, 11): 'Id-ul-Fitr', datetime.date(2024, 6, 17): 'Bakrid', datetime.date(2024, 3, 29): 'Good Friday', datetime.date(2024, 12, 25): 'Christmas', datetime.date(2021, 1, 26): 'Republic Day', datetime.date(2021, 8, 15): 'Independence Day', datetime.date(2021, 10, 2): 'Gandhi Jayanti', datetime.date(2021, 5, 26): 'Buddha Purnima', datetime.date(2021, 11, 4): 'Diwali', datetime.date(2021, 8, 30): 'Janmashtami', datetime.date(2021, 10, 15): 'Dussehra', datetime.date(2021, 4, 25): 'Mahavir Jayanti', datetime.date(2021, 3, 11): 'Maha Shivaratri', datetime.date(2021, 11, 19): 'Guru Nanak Jayanti', datetime.date(2021, 8, 20): 'Muharram', datetime.date(2021, 10, 19): 'Milad-un-Nabi', datetime.date(2021, 5, 14): 'Id-ul-Fitr', datetime.date(2021, 7, 21): 'Bakrid', datetime.date(2021, 4, 2): 'Good Friday', datetime.date(2021, 12, 25): 'Christmas', datetime.date(2022, 1, 26): 'Republic Day', datetime.date(2022, 8, 15): 'Independence Day', datetime.date(2022, 10, 2): 'Gandhi Jayanti', datetime.date(2022, 5, 16): 'Buddha Purnima', datetime.date(2022, 10, 24): 'Diwali', datetime.date(2022, 8, 19): 'Janmashtami', datetime.date(2022, 10, 5): 'Dussehra', datetime.date(2022, 4, 14): 'Mahavir Jayanti', datetime.date(2022, 3, 1): 'Maha Shivaratri', datetime.date(2022, 11, 8): 'Guru Nanak Jayanti', datetime.date(2022, 8, 9): 'Muharram', datetime.date(2022, 10, 9): 'Milad-un-Nabi', datetime.date(2022, 5, 3): 'Id-ul-Fitr', datetime.date(2022, 7, 10): 'Bakrid', datetime.date(2022, 4, 15): 'Good Friday', datetime.date(2022, 12, 25): 'Christmas', datetime.date(2023, 1, 26): 'Republic Day', datetime.date(2023, 8, 15): 'Independence Day', datetime.date(2023, 10, 2): 'Gandhi Jayanti', datetime.date(2023, 5, 5): 'Buddha Purnima', datetime.date(2023, 11, 12): 'Diwali', datetime.date(2023, 9, 7): 'Janmashtami', datetime.date(2023, 10, 24): 'Dussehra', datetime.date(2023, 4, 4): 'Mahavir Jayanti', datetime.date(2023, 2, 18): 'Maha Shivaratri', datetime.date(2023, 11, 27): 'Guru Nanak Jayanti', datetime.date(2023, 7, 29): 'Muharram', datetime.date(2023, 9, 28): 'Milad-un-Nabi', datetime.date(2023, 4, 22): 'Id-ul-Fitr', datetime.date(2023, 6, 29): 'Bakrid', datetime.date(2023, 4, 7): 'Good Friday', datetime.date(2023, 12, 25): 'Christmas'}
In [57]:
#saving the new data in its own column
data['Holidays'] = holidays.IN(years= data.year)
data.head(3)
Out[57]:
Power demand temp dwpt rhum wspd pres year is_weekend weekofyear month day hour minute moving_avg_3 dayofweek dayofyear quarter Holidays
datetime
2021-01-01 00:30:00 2014.00 8.0 6.9 93.0 0.0 1017.0 2021 0 53 1 1 0 30 NaN 5 1 1 NaN
2021-01-01 00:35:00 2005.63 8.0 6.9 93.0 0.0 1017.0 2021 0 53 1 1 0 35 NaN 5 1 1 NaN
2021-01-01 00:40:00 1977.60 8.0 6.9 93.0 0.0 1017.0 2021 0 53 1 1 0 40 1999.076667 5 1 1 NaN
In [58]:
# Dropping some columns we do not need
columns_to_drop = ['dwpt', 'wspd', 'pres', 'minute', 'day', 'moving_avg_3']

# Drop the columns
data.drop(columns=columns_to_drop, inplace=True)

# Check the result of the remaining columns
print(data.columns.tolist())
['Power demand', 'temp', 'rhum', 'year', 'is_weekend', 'weekofyear', 'month', 'hour', 'dayofweek', 'dayofyear', 'quarter', 'Holidays']
In [59]:
data.head()
Out[59]:
Power demand temp rhum year is_weekend weekofyear month hour dayofweek dayofyear quarter Holidays
datetime
2021-01-01 00:30:00 2014.00 8.0 93.0 2021 0 53 1 0 5 1 1 NaN
2021-01-01 00:35:00 2005.63 8.0 93.0 2021 0 53 1 0 5 1 1 NaN
2021-01-01 00:40:00 1977.60 8.0 93.0 2021 0 53 1 0 5 1 1 NaN
2021-01-01 00:45:00 1976.44 8.0 93.0 2021 0 53 1 0 5 1 1 NaN
2021-01-01 00:50:00 1954.37 8.0 93.0 2021 0 53 1 0 5 1 1 NaN
In [60]:
#Creating some laggged features
#These are past demand values: strong predictors which can be used as input to predict future values
#enables the models to learn from the historical data and improve from historical accurancy
In [61]:
#creating a demand lag feature from the same hour yesterday, and from the same hour last week
data['demand_lagged_24hr'] = data ['Power demand'].shift(24)
In [70]:
#creating another feature fro the demand lagged one week
#24hrs * 7 = 168 hours for a weekly shift
data['demand_lagged_168hr'] = data ['Power demand'].shift(186)
In [71]:
#creating two columns, one for the moving average and another for the standard deviation
In [72]:
data['demand_rolling_mean_24hr'] = data['Power demand'].rolling(window=24).mean()
In [76]:
data['demand_rolling_std_24hr'] = data['Power demand'].rolling(window=24).std()
In [ ]:
#dropping the holiday column since we will ont use it
data = data.drop('Holidays',axis=1)
In [80]:
#dropping all the rows having nan values
data = data.dropna()
data
Out[80]:
Power demand temp rhum year is_weekend weekofyear month hour dayofweek dayofyear quarter demand_lagged_24hr demand_lagged_168hr demand_rolling_mean_24hr demand_rolling_std_24hr
datetime
2021-01-01 16:00:00 3449.66 18.0 42.0 2021 0 53 1 16 5 1 1 3938.71 2014.00 3648.217083 159.107017
2021-01-01 16:05:00 3415.85 18.0 42.0 2021 0 53 1 16 5 1 1 3918.13 2005.63 3627.288750 155.042152
2021-01-01 16:10:00 3420.58 18.0 42.0 2021 0 53 1 16 5 1 1 3905.85 1977.60 3607.069167 148.645444
2021-01-01 16:15:00 3406.31 18.0 42.0 2021 0 53 1 16 5 1 1 3867.80 1976.44 3587.840417 143.200310
2021-01-01 16:20:00 3417.71 18.0 42.0 2021 0 53 1 16 5 1 1 3855.52 1954.37 3569.598333 135.285687
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2024-12-12 00:10:00 2146.84 12.3 69.0 2024 0 50 12 0 4 347 4 3446.53 3588.77 3120.540417 373.904837
2024-12-12 00:15:00 2116.66 12.3 69.0 2024 0 50 12 0 4 347 4 3402.37 3568.09 3066.969167 420.919382
2024-12-12 00:20:00 2082.77 12.3 69.0 2024 0 50 12 0 4 347 4 3348.17 3457.49 3014.244167 461.464483
2024-12-12 00:25:00 2059.17 12.3 69.0 2024 0 50 12 0 4 347 4 3311.41 3469.89 2962.067500 495.911949
2024-12-12 00:30:00 2049.66 12.3 69.0 2024 0 50 12 0 4 347 4 3297.00 3413.51 2910.095000 523.858573

393254 rows × 15 columns

In [83]:
#plot raw data over time (demand over time)
data['Power demand'].plot(figsize=(15,6), title = 'Electricity Demand Over Time')
plt.xlabel('Year')
plt.ylabel('Demand (in MW)')
plt.show()
No description has been provided for this image
In [85]:
#visualize the demand by hour of the day
plt.figure(figsize = (15,6))
sns.boxplot(data = data, x = 'hour', y = 'Power demand')
plt.title("Demand by the Hour")
plt.show()
No description has been provided for this image
In [87]:
#visualize the demand by month
plt.figure (figsize=(15,6))
sns.boxplot(data = data, x = 'month', y ='Power demand')
plt.title("Demand by Month")
plt.show()
No description has been provided for this image
In [92]:
plt.figure(figsize=(10,10))
sns.regplot(data=data, x='temp', y='Power demand', 
            scatter_kws={'alpha':0.5},
            line_kws={'color': 'black', 'linewidth': 2})
plt.title("Demand vs Temperature with Trend Line")
plt.show()
No description has been provided for this image
In [97]:
#showinng the correlation matrix between the different features of the dataframe
plt.figure(figsize=(12,8))
sns.heatmap(data.corr(), annot=True, fmt=".2f",cmap='YlGnBu')
plt.title('Correlation Matrix of Features')
plt.show()
No description has been provided for this image
In [99]:
#Building the prediction model, at the end of the day, we need to predict the power demand
In [100]:
#renaming the demand column 
data.rename(columns={'Power demand': 'demand'}, inplace=True)
In [101]:
data.head()
Out[101]:
demand temp rhum year is_weekend weekofyear month hour dayofweek dayofyear quarter demand_lagged_24hr demand_lagged_168hr demand_rolling_mean_24hr demand_rolling_std_24hr
datetime
2021-01-01 16:00:00 3449.66 18.0 42.0 2021 0 53 1 16 5 1 1 3938.71 2014.00 3648.217083 159.107017
2021-01-01 16:05:00 3415.85 18.0 42.0 2021 0 53 1 16 5 1 1 3918.13 2005.63 3627.288750 155.042152
2021-01-01 16:10:00 3420.58 18.0 42.0 2021 0 53 1 16 5 1 1 3905.85 1977.60 3607.069167 148.645444
2021-01-01 16:15:00 3406.31 18.0 42.0 2021 0 53 1 16 5 1 1 3867.80 1976.44 3587.840417 143.200310
2021-01-01 16:20:00 3417.71 18.0 42.0 2021 0 53 1 16 5 1 1 3855.52 1954.37 3569.598333 135.285687
In [106]:
#defining the target variable (Y) dependent variable
Y = data.demand
In [107]:
#defining the features (X) predictors/ features/ variables
X = data.drop('demand', axis = 1)
In [108]:
print(Y)
datetime
2021-01-01 16:00:00    3449.66
2021-01-01 16:05:00    3415.85
2021-01-01 16:10:00    3420.58
2021-01-01 16:15:00    3406.31
2021-01-01 16:20:00    3417.71
                        ...   
2024-12-12 00:10:00    2146.84
2024-12-12 00:15:00    2116.66
2024-12-12 00:20:00    2082.77
2024-12-12 00:25:00    2059.17
2024-12-12 00:30:00    2049.66
Name: demand, Length: 393254, dtype: float64
In [109]:
X
Out[109]:
temp rhum year is_weekend weekofyear month hour dayofweek dayofyear quarter demand_lagged_24hr demand_lagged_168hr demand_rolling_mean_24hr demand_rolling_std_24hr
datetime
2021-01-01 16:00:00 18.0 42.0 2021 0 53 1 16 5 1 1 3938.71 2014.00 3648.217083 159.107017
2021-01-01 16:05:00 18.0 42.0 2021 0 53 1 16 5 1 1 3918.13 2005.63 3627.288750 155.042152
2021-01-01 16:10:00 18.0 42.0 2021 0 53 1 16 5 1 1 3905.85 1977.60 3607.069167 148.645444
2021-01-01 16:15:00 18.0 42.0 2021 0 53 1 16 5 1 1 3867.80 1976.44 3587.840417 143.200310
2021-01-01 16:20:00 18.0 42.0 2021 0 53 1 16 5 1 1 3855.52 1954.37 3569.598333 135.285687
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2024-12-12 00:10:00 12.3 69.0 2024 0 50 12 0 4 347 4 3446.53 3588.77 3120.540417 373.904837
2024-12-12 00:15:00 12.3 69.0 2024 0 50 12 0 4 347 4 3402.37 3568.09 3066.969167 420.919382
2024-12-12 00:20:00 12.3 69.0 2024 0 50 12 0 4 347 4 3348.17 3457.49 3014.244167 461.464483
2024-12-12 00:25:00 12.3 69.0 2024 0 50 12 0 4 347 4 3311.41 3469.89 2962.067500 495.911949
2024-12-12 00:30:00 12.3 69.0 2024 0 50 12 0 4 347 4 3297.00 3413.51 2910.095000 523.858573

393254 rows × 14 columns

In [110]:
#splitting the data into training and testing data by 20% to 80%
X_train = X.loc[: '2024-02-29'] #taking 80% of the data as testing data
In [111]:
X_train
Out[111]:
temp rhum year is_weekend weekofyear month hour dayofweek dayofyear quarter demand_lagged_24hr demand_lagged_168hr demand_rolling_mean_24hr demand_rolling_std_24hr
datetime
2021-01-01 16:00:00 18.0 42.0 2021 0 53 1 16 5 1 1 3938.71 2014.00 3648.217083 159.107017
2021-01-01 16:05:00 18.0 42.0 2021 0 53 1 16 5 1 1 3918.13 2005.63 3627.288750 155.042152
2021-01-01 16:10:00 18.0 42.0 2021 0 53 1 16 5 1 1 3905.85 1977.60 3607.069167 148.645444
2021-01-01 16:15:00 18.0 42.0 2021 0 53 1 16 5 1 1 3867.80 1976.44 3587.840417 143.200310
2021-01-01 16:20:00 18.0 42.0 2021 0 53 1 16 5 1 1 3855.52 1954.37 3569.598333 135.285687
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2024-02-29 23:25:00 18.0 64.0 2024 0 9 2 23 4 60 1 2926.06 3766.91 2627.903750 168.803709
2024-02-29 23:30:00 18.0 75.0 2024 0 9 2 23 4 60 1 2894.40 3832.04 2605.032917 168.310756
2024-02-29 23:35:00 18.0 75.0 2024 0 9 2 23 4 60 1 2841.57 3787.23 2582.132500 172.076109
2024-02-29 23:40:00 18.0 75.0 2024 0 9 2 23 4 60 1 2865.73 3792.01 2557.773333 171.563203
2024-02-29 23:45:00 18.0 75.0 2024 0 9 2 23 4 60 1 2821.97 3833.37 2534.717083 171.696401

314928 rows × 14 columns

In [113]:
Y_train = Y.loc[:'2024-02-29'] #taking 80% of the data as the training data
In [123]:
X_test = X.loc['2024-03-01':] #20% of testing dataset
In [120]:
Y_test = Y.loc['2024-03-01':]
In [124]:
print (X_train.shape)
print(Y_train.shape)
(314928, 14)
(314928,)
In [125]:
print (X_test.shape)
print(Y_test.shape)
(78326, 14)
(78326,)
In [128]:
#XGBoost Model
#XGboost used for time series forecasting, as it can handle non-linear data easily.
#it can handle seasonality and trends in time series data effectively.
In [130]:
!pip install xgboost
from xgboost import XGBRegressor
Collecting xgboost
  Downloading xgboost-3.2.0-py3-none-win_amd64.whl.metadata (2.1 kB)
Requirement already satisfied: numpy in c:\users\t\anaconda3\lib\site-packages (from xgboost) (2.3.5)
Requirement already satisfied: scipy in c:\users\t\anaconda3\lib\site-packages (from xgboost) (1.16.3)
Downloading xgboost-3.2.0-py3-none-win_amd64.whl (101.7 MB)
   ---------------------------------------- 0.0/101.7 MB ? eta -:--:--
   ---------------------------------------- 0.3/101.7 MB ? eta -:--:--
   - -------------------------------------- 3.4/101.7 MB 13.6 MB/s eta 0:00:08
   -- ------------------------------------- 5.8/101.7 MB 12.8 MB/s eta 0:00:08
   --- ------------------------------------ 7.9/101.7 MB 11.6 MB/s eta 0:00:09
   ---- ----------------------------------- 10.5/101.7 MB 11.7 MB/s eta 0:00:08
   ----- ---------------------------------- 13.1/101.7 MB 11.8 MB/s eta 0:00:08
   ----- ---------------------------------- 14.9/101.7 MB 11.6 MB/s eta 0:00:08
   ----- ---------------------------------- 15.2/101.7 MB 9.8 MB/s eta 0:00:09
   ------- -------------------------------- 19.1/101.7 MB 10.8 MB/s eta 0:00:08
   -------- ------------------------------- 22.0/101.7 MB 11.1 MB/s eta 0:00:08
   --------- ------------------------------ 24.4/101.7 MB 11.1 MB/s eta 0:00:07
   ---------- ----------------------------- 26.2/101.7 MB 10.8 MB/s eta 0:00:07
   ----------- ---------------------------- 28.8/101.7 MB 11.0 MB/s eta 0:00:07
   ------------ --------------------------- 31.5/101.7 MB 11.1 MB/s eta 0:00:07
   ------------- -------------------------- 34.3/101.7 MB 11.3 MB/s eta 0:00:06
   -------------- ------------------------- 37.2/101.7 MB 11.4 MB/s eta 0:00:06
   --------------- ------------------------ 39.3/101.7 MB 11.3 MB/s eta 0:00:06
   ---------------- ----------------------- 41.9/101.7 MB 11.4 MB/s eta 0:00:06
   ----------------- ---------------------- 44.3/101.7 MB 11.3 MB/s eta 0:00:06
   ------------------ --------------------- 46.7/101.7 MB 11.3 MB/s eta 0:00:05
   ------------------- -------------------- 49.0/101.7 MB 11.3 MB/s eta 0:00:05
   -------------------- ------------------- 51.4/101.7 MB 11.3 MB/s eta 0:00:05
   --------------------- ------------------ 54.0/101.7 MB 11.3 MB/s eta 0:00:05
   ---------------------- ----------------- 56.4/101.7 MB 11.3 MB/s eta 0:00:05
   ----------------------- ---------------- 59.2/101.7 MB 11.4 MB/s eta 0:00:04
   ------------------------ --------------- 61.9/101.7 MB 11.4 MB/s eta 0:00:04
   ------------------------- -------------- 64.7/101.7 MB 11.5 MB/s eta 0:00:04
   -------------------------- ------------- 67.1/101.7 MB 11.5 MB/s eta 0:00:04
   -------------------------- ------------- 67.4/101.7 MB 11.5 MB/s eta 0:00:03
   ---------------------------- ----------- 71.6/101.7 MB 11.4 MB/s eta 0:00:03
   ---------------------------- ----------- 73.7/101.7 MB 11.4 MB/s eta 0:00:03
   ------------------------------ --------- 76.5/101.7 MB 11.4 MB/s eta 0:00:03
   ------------------------------- -------- 78.9/101.7 MB 11.4 MB/s eta 0:00:02
   ------------------------------- -------- 80.7/101.7 MB 11.3 MB/s eta 0:00:02
   -------------------------------- ------- 82.6/101.7 MB 11.2 MB/s eta 0:00:02
   -------------------------------- ------- 83.9/101.7 MB 11.1 MB/s eta 0:00:02
   --------------------------------- ------ 85.7/101.7 MB 11.0 MB/s eta 0:00:02
   ---------------------------------- ----- 88.3/101.7 MB 11.0 MB/s eta 0:00:02
   ----------------------------------- ---- 91.2/101.7 MB 11.1 MB/s eta 0:00:01
   ------------------------------------ --- 93.8/101.7 MB 11.1 MB/s eta 0:00:01
   ------------------------------------- -- 95.7/101.7 MB 11.1 MB/s eta 0:00:01
   -------------------------------------- - 98.3/101.7 MB 11.1 MB/s eta 0:00:01
   --------------------------------------  100.9/101.7 MB 11.1 MB/s eta 0:00:01
   --------------------------------------  101.4/101.7 MB 11.1 MB/s eta 0:00:01
   ---------------------------------------- 101.7/101.7 MB 10.8 MB/s  0:00:09
Installing collected packages: xgboost
Successfully installed xgboost-3.2.0
In [135]:
#from sklearn library importing two fundamental metrics to calculate the performance of the model
from sklearn.metrics import mean_squared_error, mean_absolute_error
#from model_selection module of scikit-learn's library, import timeseries split class
#which perfomrs appropriate cross validation when dealing with time series data
from sklearn.model_selection import TimeSeriesSplit
In [136]:
#Initializing and training the model
model_xgb = XGBRegressor(n_estimators=1000,
                        early_stopping_rounds=50,
                        learning_rate=0.01,
                        random_state=42,
                        objective='reg:squarederror')
In [137]:
model_xgb.fit(X_train, Y_train, eval_set = [(X_train, Y_train), (X_test,Y_test)], verbose=False)
Out[137]:
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=50,
             enable_categorical=False, eval_metric=None, feature_types=None,
             feature_weights=None, gamma=None, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=0.01, max_bin=None, max_cat_threshold=None,
             max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
             max_leaves=None, min_child_weight=None, missing=nan,
             monotone_constraints=None, multi_strategy=None, n_estimators=1000,
             n_jobs=None, num_parallel_tree=None, ...)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Parameters
objective  'reg:squarederror'
base_score  None
booster  None
callbacks  None
colsample_bylevel  None
colsample_bynode  None
colsample_bytree  None
device  None
early_stopping_rounds  50
enable_categorical  False
eval_metric  None
feature_types  None
feature_weights  None
gamma  None
grow_policy  None
importance_type  None
interaction_constraints  None
learning_rate  0.01
max_bin  None
max_cat_threshold  None
max_cat_to_onehot  None
max_delta_step  None
max_depth  None
max_leaves  None
min_child_weight  None
missing  nan
monotone_constraints  None
multi_strategy  None
n_estimators  1000
n_jobs  None
num_parallel_tree  None
random_state  42
reg_alpha  None
reg_lambda  None
sampling_method  None
scale_pos_weight  None
subsample  None
tree_method  None
validate_parameters  None
verbosity  None
In [138]:
#making predictions
predictions_xgb = model_xgb.predict(X_test)
In [139]:
#evaluating the model
rmse_xgb = np.sqrt(mean_squared_error(Y_test,predictions_xgb))
mae_xgb = mean_absolute_error(Y_test,predictions_xgb)
In [141]:
print('XGBoost RMSE:', rmse_xgb)
print('XGBoost MAE:', mae_xgb)
XGBoost RMSE: 191.6282277687719
XGBoost MAE: 88.88229013144982
In [152]:
#visualize the predictions
plt.figure(figsize = (15,6))
plt.plot(Y_test.index,Y_test, label= 'Actual demand')
plt.plot(Y_test.index, predictions_xgb, label='Predicted Demand', color = 'Red')
plt.title('XGBoost Electricity demand Prediction')
plt.xlabel('Date')
plt.ylabel('Power Demand')
plt.legend()
plt.show()
No description has been provided for this image
In [153]:
#saving the model
import joblib
joblib.dump(model_xgb, 'electricity_xbg_prediction_model.pkl')
Out[153]:
['electricity_xbg_prediction_model.pkl']
In [ ]: