Skip to main content

Feature Engineering

 Feature engineering is the process of extracting features from raw data and transforming them into formats that are suitable for machine learning algorithms.


It is divided into 3 broad categories:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
# Show every column when a DataFrame is printed.
pd.set_option('display.max_columns', None)

# Load the training and test spreadsheets.
data_train = pd.read_excel('Data_Train.xlsx')
data_test = pd.read_excel('Data_Test.xlsx')

# Keep the target aside; it is dropped from the combined feature frame below.
price_train = data_train.Price

# Concatenate training and test sets so both receive identical transformations.
# Index labels are kept as-is, so they repeat across the two parts.
data = pd.concat([data_train.drop(['Price'], axis=1), data_test])
Index(['Airline', 'Date_of_Journey', 'Source', 'Destination', 'Route','Dep_Time', 'Arrival_Time', 'Duration', 'Total_Stops',
'Additional_Info', 'Price'], dtype='object')
<class 'pandas.core.frame.DataFrame'>
Int64Index: 13354 entries, 0 to 2670
Data columns (total 10 columns):
Airline 13354 non-null object
Date_of_Journey 13354 non-null object
Source 13354 non-null object
Destination 13354 non-null object
Route 13353 non-null object
Dep_Time 13354 non-null object
Arrival_Time 13354 non-null object
Duration 13354 non-null object
Total_Stops 13353 non-null object
Additional_Info 13354 non-null object
dtypes: int64(1), object(10)
memory usage: 918.1+ KB
# Remove fully duplicated rows, comparing all columns and keeping the first
# occurrence of each (these are the defaults, spelled out).
data = data.drop_duplicates(subset=None, keep='first')
Airline            0
Date_of_Journey 0
Source 0
Destination 0
Route 1
Dep_Time 0
Arrival_Time 0
Duration 0
Total_Stops 1
Additional_Info 0
Price 0
dtype: int64
# Discard rows with no Route. A boolean mask is used instead of dropping by
# index label: labels repeat after the train/test concatenation, so a
# label-based drop would also remove any other row sharing that label.
data = data[~data['Route'].isnull()]

Airlines

# Bar chart of the number of rows per airline.
# NOTE(review): `sns` is not among the imports shown in this file; it
# presumably needs `import seaborn as sns` -- confirm.
sns.countplot(x='Airline', data=data)
# Rotate the tick labels to vertical.
plt.xticks(rotation=90)
# Fold the fare-class variants of an airline into the parent airline so each
# carrier is represented by a single category.
data['Airline'] = np.where(
    data['Airline'] == 'Vistara Premium economy', 'Vistara', data['Airline'])
data['Airline'] = np.where(
    data['Airline'] == 'Jet Airways Business', 'Jet Airways', data['Airline'])
data['Airline'] = np.where(
    data['Airline'] == 'Multiple carriers Premium economy',
    'Multiple carriers', data['Airline'])

Flight’s Destination

# Inspect the distinct destination labels (notebook display expression).
data['Destination'].unique()

# Relabel 'Delhi' as 'New Delhi' so the destination has a single category.
data['Destination'] = np.where(
    data['Destination'] == 'Delhi', 'New Delhi', data['Destination'])

Date of Journey

24/03/2019
1/05/2019
# Journey dates are day-first strings such as '24/03/2019' and '1/05/2019'.
# The format must be given explicitly: without it, ambiguous dates are read
# month-first, so '1/05/2019' was parsed as 5 January instead of 1 May while
# '24/03/2019' was parsed as 24 March.
data['Date_of_Journey'] = pd.to_datetime(data['Date_of_Journey'], format='%d/%m/%Y')
# OUTPUT
# 2019-03-24
# 2019-05-01

# Name of the weekday on which the journey takes place.
data['day_of_week'] = data['Date_of_Journey'].dt.day_name()
# OUTPUT
# Sunday
# Wednesday

# Date_of_Journey is already a datetime column at this point, so it is used
# directly rather than being parsed a second time.
data['Journey_Month'] = data['Date_of_Journey'].dt.month_name()
# OUTPUT
# March
# May

Departure Time of Airlines

# Parse the 'HH:MM' departure time; only the hour is used below.
data['Departure_t'] = pd.to_datetime(data.Dep_Time, format='%H:%M')

# Bucket the departure hour into four sessions. Bins are right-closed:
# (0, 6] Night, (6, 12] Morning, (12, 18] Afternoon, (18, 24] Evening.
# include_lowest=True closes the first bin at 0, so midnight departures are
# binned as 'Night' directly instead of coming out as missing.
data['Departure_S'] = pd.cut(
    data.Departure_t.dt.hour,
    [0, 6, 12, 18, 24],
    labels=['Night', 'Morning', 'Afternoon', 'Evening'],
    include_lowest=True,
)

# Any value still missing falls back to 'Night', as before. The result is
# assigned back because an in-place fillna on a column selection may act on a
# temporary copy and leave the frame unchanged.
data['Departure_S'] = data['Departure_S'].fillna("Night")

Duration

# Normalise every duration string to the two-token form '<H>h <M>m':
# '19h' becomes '19h 0m' and '5m' becomes '0h 5m'.
duration = list(data['Duration'])
for i, text in enumerate(duration):
    if len(text.split()) != 2:
        if 'h' in text:
            duration[i] = text.strip() + ' 0m'
        elif 'm' in text:
            duration[i] = '0h {}'.format(text.strip())

# Strip the trailing unit letter from each token and convert to int.
dur_hours = [int(text.split()[0][:-1]) for text in duration]
dur_minutes = [int(text.split()[1][:-1]) for text in duration]

data['Duration_hours'] = dur_hours
data['Duration_minutes'] = dur_minutes

# Convert the hour part to minutes in place, then add the minute part to get
# the total flight duration in minutes.
data.loc[:, 'Duration_hours'] *= 60
data['Duration_Total_mins'] = data['Duration_hours'] + data['Duration_minutes']
# Remove rows whose total duration is under 60 minutes, keeping the rest.
# A boolean mask is used rather than collecting index labels and dropping
# them: labels repeat after the train/test concatenation, so a label-based
# drop would also remove other rows that share a label.
data = data[data.Duration_Total_mins >= 60].copy()
# Raw and intermediate columns that are no longer needed once the engineered
# features have been derived from them.
obsolete_columns = [
    'Arrival_Time',
    'Dep_Time',
    'Date_of_Journey',
    'Duration',
    'Departure_t',
    'Duration_hours',
    'Duration_minutes',
]
data.drop(columns=obsolete_columns, inplace=True)

Dummy Variables

# Categorical columns to be one-hot encoded.
cat_vars = ['Airline', 'Source', 'Destination', 'Route', 'Total_Stops',
            'Additional_Info', 'day_of_week', 'Journey_Month', 'Departure_S']

# Encode all categorical columns in one call; each indicator column is named
# '<column>_<value>'. The indicators come from the same frame and share its
# index, so they are appended column-wise. Joining one column at a time on the
# index is avoided because the index labels are not unique after the
# train/test concatenation, and an index join would match repeated labels
# many-to-many.
dummies = pd.get_dummies(data[cat_vars], columns=cat_vars)
data = pd.concat([data, dummies], axis=1)

# Keep everything except the original (un-encoded) categorical columns.
data_vars = data.columns.values.tolist()
to_keep = [i for i in data_vars if i not in cat_vars]
data_final = data[to_keep]


Comments

Popular posts from this blog

Feature Importance

 Feature Importance for regression from sklearn . datasets import make _ regression # define dataset X , y = make_regression ( n_samples = 1000 , n_features = 10 , n_informative = 5 , random_state = 1 ) # summarize the dataset print ( X . shape , y . shape ) #linear regression feature importance from sklearn . datasets import make_regression from sklearn . linear_model import LinearRegression from matplotlib import pyplot # define dataset X , y = make_regression ( n_samples = 1000 , n_features = 10 , n_informative = 5 , random_state = 1 ) # define the model model = LinearRegression ( ) # fit the model model . fit ( X , y ) # get importance importance = model . coef _ # summarize feature importance for i , v in enumerate ( importance ) : print ( 'Feature: %0d, Score: %.5f' % ( i , v ) ) # plot feature importance pyplot . bar ( [ x for x in range ( len ( importance ) ) ] , importance ) pyplot . show ( ) feature importance for classification # test cl...