# Setting Working Directory
# NOTE(review): hard-coded absolute Windows path — this only runs on the
# original author's machine; consider a project-relative path instead.
setwd("C:/Users/Capstone Project")
getwd()
## [1] "C:/Users/Capstone Project"
# Importing Dataset
library(readr)
# Flight-level records, one row per customer; keyed by CustomerID (see join below).
flight_dt <- read_csv("Aviation Data/Marketing Project_Flight_data.csv")
## Parsed with column specification:
## cols(
## CustomerID = col_double(),
## Gender = col_character(),
## CustomerType = col_character(),
## Age = col_double(),
## TypeTravel = col_character(),
## Class = col_character(),
## Flight_Distance = col_double(),
## DepartureDelayin_Mins = col_double(),
## ArrivalDelayin_Mins = col_double()
## )
# Customer survey responses; keyed by CustomerId (note different capitalisation
# from flight_dt's CustomerID — handled explicitly in the join below).
survey_dt <- read_csv("Aviation Data/Marketing Project_Survey_data.csv")
## Parsed with column specification:
## cols(
## CustomerId = col_double(),
## Satisfaction = col_character(),
## Seat_comfort = col_character(),
## Departure.Arrival.time_convenient = col_character(),
## Food_drink = col_character(),
## Gate_location = col_character(),
## Inflightwifi_service = col_character(),
## Inflight_entertainment = col_character(),
## Online_support = col_character(),
## Ease_of_Onlinebooking = col_character(),
## Onboard_service = col_character(),
## Leg_room_service = col_character(),
## Baggage_handling = col_character(),
## Checkin_service = col_character(),
## Cleanliness = col_character(),
## Online_boarding = col_character()
## )
library(dplyr)
library(janitor) # provides clean_names(); it was called below without being loaded
# Merge flight records with survey responses on the customer key
# (the key column is spelled differently in each table).
Aviation <- left_join(flight_dt, survey_dt, by = c("CustomerID" = "CustomerId"))
# Display-only call: clean_names() returns a renamed copy and the result is NOT
# assigned, so Aviation keeps its original column names (the code below relies
# on names such as ArrivalDelayin_Mins and Departure.Arrival.time_convenient).
clean_names(Aviation)
## # A tibble: 90,917 x 24
## customer_id gender customer_type age type_travel class flight_distance
## <dbl> <chr> <chr> <dbl> <chr> <chr> <dbl>
## 1 149965 Female Loyal Custom~ 65 Personal T~ Eco 265
## 2 149966 Female Loyal Custom~ 15 Personal T~ Eco 2138
## 3 149967 Female Loyal Custom~ 60 Personal T~ Eco 623
## 4 149968 Female Loyal Custom~ 70 Personal T~ Eco 354
## 5 149969 Male Loyal Custom~ 30 <NA> Eco 1894
## 6 149970 Female Loyal Custom~ 66 Personal T~ Eco 227
## 7 149971 Male Loyal Custom~ 10 Personal T~ Eco 1812
## 8 149972 Male Loyal Custom~ 22 Personal T~ Eco 1556
## 9 149973 Female Loyal Custom~ 58 Personal T~ Eco 104
## 10 149974 Female Loyal Custom~ 34 Personal T~ Eco 3633
## # ... with 90,907 more rows, and 17 more variables:
## # departure_delayin_mins <dbl>, arrival_delayin_mins <dbl>,
## # satisfaction <chr>, seat_comfort <chr>,
## # departure_arrival_time_convenient <chr>, food_drink <chr>,
## # gate_location <chr>, inflightwifi_service <chr>,
## # inflight_entertainment <chr>, online_support <chr>,
## # ease_of_onlinebooking <chr>, onboard_service <chr>,
## # leg_room_service <chr>, baggage_handling <chr>, checkin_service <chr>,
## # cleanliness <chr>, online_boarding <chr>
Aviation = Aviation[-c(1)] # drop column 1 (CustomerID) — an identifier, not a predictor
# Convert every character column to a factor.
Aviation <- mutate_if(Aviation, is.character, as.factor)
## Remove CustomerType (now col 2) and TypeTravel (now col 4) from the analysis,
## as both contain ~10% missing values.
Aviation = Aviation[-c(2,4)]
## Impute the column mean into missing values of ArrivalDelayin_Mins.
## NOTE(review): DepartureDelayin_Mins is not imputed here — confirm it has no NAs.
Aviation$ArrivalDelayin_Mins[is.na(Aviation$ArrivalDelayin_Mins)] = mean(Aviation$ArrivalDelayin_Mins, na.rm = TRUE)
## Statistical mode: the most frequent value in x.
## Ties are broken by order of first appearance (which.max returns the first maximum).
dt_mode <- function(x) {
  vals <- unique(x)
  freq <- tabulate(match(x, vals))
  vals[which.max(freq)]
}
# Impute the mode into the surveyed columns that contain missing values.
# The mode is computed over the non-missing entries of each column only.
for (survey_col in c("Departure.Arrival.time_convenient", "Food_drink", "Onboard_service")) {
  gaps <- is.na(Aviation[[survey_col]])
  Aviation[[survey_col]][gaps] <- dt_mode(Aviation[[survey_col]][!gaps])
}
# Converting categorical variables to integers.
Aviation$Gender <- as.numeric(ifelse(Aviation$Gender == "Male", 1, 0))
Aviation$Class <- as.numeric(ifelse(Aviation$Class == "Business", 1,
                             ifelse(Aviation$Class == "Eco Plus", 2, 3)))
# Map one survey-response column from its text rating to a 0-5 score.
# Anything not matching the five named ratings (e.g. "excellent") maps to 5;
# NA stays NA (ifelse propagates it).
recode_rating <- function(col) {
  as.numeric(ifelse(col == "extremely poor", 0,
             ifelse(col == "poor", 1,
             ifelse(col == "need improvement", 2,
             ifelse(col == "acceptable", 3,
             ifelse(col == "good", 4, 5))))))
}
# Fix: the original applied ifelse() to Aviation[c(8:21)] as a whole, producing
# a matrix that as.numeric() flattened to a single vector; the assignment back
# into 14 columns only worked by relying on column-major recycling. Recoding
# per column with lapply() yields the same values without that fragility.
Aviation[c(8:21)] <- lapply(Aviation[c(8:21)], recode_rating)
#### **Label Conversion and Convert dataset to Dataframe**
library(xgboost)
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
# Convert the Satisfaction factor to an integer class starting at 0
# This is picky, but it's a requirement for XGBoost
# NOTE(review): the integer conversion described above is never actually
# performed — the label stays a 2-level factor (see str() output); confirm.
# Reorder so Satisfaction (originally column 7) becomes the last column,
# leaving the 20 feature columns contiguous in front of it. The original did
# this via a temporary "Prediction" column copied in and back out again.
feature_cols <- setdiff(names(Aviation), "Satisfaction")
xgbAvi <- Aviation[c(feature_cols, "Satisfaction")]
xgbAvi <- as.data.frame(xgbAvi) # Converting dataset to dataframe
str(xgbAvi)
## 'data.frame': 90917 obs. of 21 variables:
## $ Gender : num 0 0 0 0 1 0 1 1 0 0 ...
## $ Age : num 65 15 60 70 30 66 10 22 58 34 ...
## $ Class : num 3 3 3 3 3 3 3 3 3 3 ...
## $ Flight_Distance : num 265 2138 623 354 1894 ...
## $ DepartureDelayin_Mins : num 0 0 0 0 0 17 0 30 47 0 ...
## $ ArrivalDelayin_Mins : num 0 0 0 0 0 15 0 26 48 0 ...
## $ Seat_comfort : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Departure.Arrival.time_convenient: num 0 0 4 0 0 0 0 4 0 0 ...
## $ Food_drink : num 0 0 0 0 0 3 3 0 0 0 ...
## $ Gate_location : num 2 5 5 5 5 5 5 5 5 5 ...
## $ Inflightwifi_service : num 2 2 3 4 2 2 2 2 3 2 ...
## $ Inflight_entertainment : num 4 0 4 3 0 5 0 0 3 0 ...
## $ Online_support : num 2 2 3 4 2 5 2 2 3 2 ...
## $ Ease_of_Onlinebooking : num 3 2 1 2 2 5 2 2 3 2 ...
## $ Onboard_service : num 3 4 1 2 5 5 3 2 3 3 ...
## $ Leg_room_service : num 0 3 0 0 4 0 3 4 0 2 ...
## $ Baggage_handling : num 3 4 1 2 5 5 4 5 1 5 ...
## $ Checkin_service : num 5 4 4 4 5 5 5 3 2 2 ...
## $ Cleanliness : num 3 4 1 2 4 5 4 4 3 5 ...
## $ Online_boarding : num 2 2 3 5 2 3 2 2 5 2 ...
## $ Satisfaction : Factor w/ 2 levels "neutral or dissatisfied",..: 2 2 2 2 2 2 2 2 2 2 ...
#### **Split the data for training and testing (75/25 split)**
library(caret) # provides createDataPartition() and confusionMatrix(); it was never loaded
# NOTE(review): no set.seed() before the partition, so the 75/25 split is not
# reproducible between runs — consider seeding if results must be replicated.
indexes <- createDataPartition(xgbAvi$Satisfaction, p = 0.75, list = FALSE)
train <- xgbAvi[indexes, ]
test <- xgbAvi[-indexes, ]
# Column 21 is the Satisfaction label; columns 1:20 are the features.
train.data <- data.matrix(train[, -21])
train.label <- train[, 21]
test.data <- data.matrix(test[, -21])
test.label <- test[, 21]
#### **Create the xgb.DMatrix objects**
# Transform the two data sets into xgb.DMatrix objects.
# NOTE(review): train.label/test.label are still a 2-level factor here, which
# coerces to the numeric values 1/2 (not 0/1) — presumably this is why the
# model below trains as a regression (train-rmse log); confirm this is intended.
xgb.train = xgb.DMatrix(data=train.data,label=train.label)
xgb.test = xgb.DMatrix(data=test.data,label=test.label)
#### **Define the model**
# We can define the xgboost model with xgboost function with changing some of the parameters. Note that xgboost is a training function, thus we need to include the train data too. Once we run the function, it fits the model with training data.
# NOTE(review): no objective is set, so the default squared-error regression is
# fit against labels coerced to 1/2 (hence the train-rmse output). For this
# 2-class problem, objective = "binary:logistic" with 0/1 labels would be more
# conventional — changing it would also require updating the prediction
# decoding further down, so it is flagged here rather than changed.
xgbModel = xgboost(data = xgb.train, max.depth=3, nrounds=50)
## [1] train-rmse:0.844626
## [2] train-rmse:0.634427
## [3] train-rmse:0.499091
## [4] train-rmse:0.413770
## [5] train-rmse:0.362687
## [6] train-rmse:0.330878
## [7] train-rmse:0.312292
## [8] train-rmse:0.301877
## [9] train-rmse:0.296154
## [10] train-rmse:0.292354
## [11] train-rmse:0.288608
## [12] train-rmse:0.285500
## [13] train-rmse:0.284341
## [14] train-rmse:0.282622
## [15] train-rmse:0.281670
## [16] train-rmse:0.280383
## [17] train-rmse:0.279421
## [18] train-rmse:0.278295
## [19] train-rmse:0.277515
## [20] train-rmse:0.276470
## [21] train-rmse:0.275707
## [22] train-rmse:0.274822
## [23] train-rmse:0.274468
## [24] train-rmse:0.273616
## [25] train-rmse:0.273212
## [26] train-rmse:0.272497
## [27] train-rmse:0.271886
## [28] train-rmse:0.271041
## [29] train-rmse:0.270028
## [30] train-rmse:0.269354
## [31] train-rmse:0.268568
## [32] train-rmse:0.268209
## [33] train-rmse:0.267856
## [34] train-rmse:0.267388
## [35] train-rmse:0.266989
## [36] train-rmse:0.266528
## [37] train-rmse:0.265093
## [38] train-rmse:0.263613
## [39] train-rmse:0.262243
## [40] train-rmse:0.261973
## [41] train-rmse:0.261679
## [42] train-rmse:0.261491
## [43] train-rmse:0.261183
## [44] train-rmse:0.260925
## [45] train-rmse:0.260741
## [46] train-rmse:0.260610
## [47] train-rmse:0.260008
## [48] train-rmse:0.259328
## [49] train-rmse:0.259093
## [50] train-rmse:0.258834
print(xgbModel)
## ##### xgb.Booster
## raw: 33.8 Kb
## call:
## xgb.train(params = params, data = dtrain, nrounds = nrounds,
## watchlist = watchlist, verbose = verbose, print_every_n = print_every_n,
## early_stopping_rounds = early_stopping_rounds, maximize = maximize,
## save_period = save_period, save_name = save_name, xgb_model = xgb_model,
## callbacks = callbacks, max.depth = 3)
## params (as set within xgb.train):
## max_depth = "3", silent = "1"
## xgb.attributes:
## niter
## callbacks:
## cb.print.evaluation(period = print_every_n)
## cb.evaluation.log()
## # of features: 20
## niter: 50
## nfeatures : 20
## evaluation_log:
## iter train_rmse
## 1 0.844626
## 2 0.634427
## ---
## 49 0.259093
## 50 0.258834
#### **Predicting test data**
# The model is ready and we can predict our test data.
xgbPreds <- predict(xgbModel, xgb.test)
# Decode numeric predictions (regression output centred around 1-2) back to
# factor labels. Clamp to the valid index range [1, 2] BEFORE rounding.
# Fix: the original only capped the top end (> 2); a prediction below 0.5
# rounds to 0, and indexing levels(...)[0] silently DROPS that element,
# misaligning pred.label against test.label.
xgbPreds[xgbPreds > 2] <- 2
xgbPreds[xgbPreds < 1] <- 1
pred.label = as.factor((levels(test.label))[round(xgbPreds)])
#### **Predict New Outcomes**
# Check the prediction accuracy with a confusion matrix.
# Fix: caret::confusionMatrix() takes the predictions as its first argument
# (`data`) and the ground truth as its second (`reference`). The original
# passed them swapped, which transposes the table and interchanges
# sensitivity/specificity and PPV/NPV in the printed statistics.
conf_mtrx = confusionMatrix(data = pred.label, reference = test.label)
print(conf_mtrx)
## Confusion Matrix and Statistics
##
## Reference
## Prediction neutral or dissatisfied satisfied
## neutral or dissatisfied 9264 1025
## satisfied 1007 11433
##
## Accuracy : 0.9106
## 95% CI : (0.9068, 0.9143)
## No Information Rate : 0.5481
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8196
##
## Mcnemar's Test P-Value : 0.7061
##
## Sensitivity : 0.9020
## Specificity : 0.9177
## Pos Pred Value : 0.9004
## Neg Pred Value : 0.9191
## Prevalence : 0.4519
## Detection Rate : 0.4076
## Detection Prevalence : 0.4527
## Balanced Accuracy : 0.9098
##
## 'Positive' Class : neutral or dissatisfied
##