In this project, I build a model that predicts whether a customer will complete a coupon once they first view it.
The model is trained on users who have no null values in their attributes and who viewed BOGO or discount coupons at least 3 times.
(Even if a customer never views or displays the coupon in the shop, the coupon can be completed automatically once they spend the required amount.)
The model mainly uses 4 kinds of features for prediction.
1. Load in files
2. Clean the data
3. Univariate Exploration
4. Multivariate Exploration
5. Make a dataframe in which each row represents a coupon sent during the observation period
6. Basic Feature Engineering
1. Load in files
2. Feature engineering 1 (make columns representing the distribution of the customer's past viewing times for coupons)
3. Feature engineering 2 (make a column representing the customer's past total completion rate)
4. Feature engineering 3 (make columns representing the customer's past total completion rate in 4 situations)
5. Create a model
6. Interpretation of the model
The data is contained in three files:
Here is the schema and explanation of each variable in the files:
portfolio.json
profile.json
transcript.json
import pandas as pd
import numpy as np
import math
import json
import matplotlib.pyplot as plt
import seaborn as sns
import time
%matplotlib inline
# read in the json files
portfolio = pd.read_json('data/portfolio.json', orient='records', lines=True)
profile = pd.read_json('data/profile.json', orient='records', lines=True)
transcript = pd.read_json('data/transcript.json', orient='records', lines=True)
pd.options.display.max_rows = 100
pd.set_option("display.max_columns", 500)
portfolio
# copy the dataframes
profile2 = profile.copy()
print(profile2.shape)
profile2.head()
# replace 118 in age column with nan
profile2.age = profile.age.replace(118, np.nan)
# convert "became_member_on" column to datetime type
profile2.became_member_on = pd.to_datetime(profile2.became_member_on, format="%Y%m%d")
# one_hot_encode "gender" column
profile2 = profile2.join(pd.get_dummies(profile2.gender,prefix="gender",dummy_na=True))
profile2 = profile2.drop("gender", axis = 1)
# set the "gender_F","gender_M","gender_O" columns to nan where the gender is unknown
for i, if_nan in enumerate(profile2.gender_nan):
    if if_nan == 1:
        profile2.loc[i, "gender_F"] = np.nan
        profile2.loc[i, "gender_M"] = np.nan
        profile2.loc[i, "gender_O"] = np.nan
profile2 = profile2.drop("gender_nan", axis=1)
profile2.info()
# the number of customers for whom ALL of age, income, and gender are missing (5 null columns: age, income, and the 3 gender columns)
print((profile2.isnull().sum(axis=1) == 5).sum())
print(transcript.shape)
transcript.sample(10)
transcript2 = transcript.copy()
# remove "offer_id", "offer id", "amount" and strip each cell value (since they are clear.)
spl_list = []
for each_value in transcript2.value:
try :
v = str(each_value).split(": ")[1]
#print(v)
if v[0].isdigit():
v = v[:-1]
elif v[0] == "'":
v = v.split("'")[1]
except Exception as e:
print(e, v)
spl_list.append(v)
transcript2.value = spl_list
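For reference, since "value" originally holds dicts, a more direct alternative would be to expand the column in one shot (a sketch, not the approach used here; I believe pd.json_normalize accepts the raw transcript's dict column and produces one column per key such as "offer id", "offer_id", and "amount"):
# sketch: expand the dict column instead of string-splitting it
expanded = pd.json_normalize(transcript.value)
transcript_flat = transcript.drop("value", axis=1).join(expanded)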
# split the column "value" into two columns: "id" (for offer received / viewed / completed)
# and "transaction" (the amount, for transactions)
l_reward = []
l_offer = []
for each_value in transcript2.value:
    try:
        each_value = float(each_value)
        l_reward.append(each_value)
        l_offer.append(np.nan)
    except ValueError:
        l_reward.append(np.nan)
        l_offer.append(each_value)
transcript2["transaction"] = l_reward
transcript2["id"] = l_offer
transcript2 = transcript2.drop("value", axis=1)
print(transcript2.shape)
transcript2.head()
transcript2.info()
# map the opaque offer ids to readable names: type_duration$difficulty (plus -$reward for discounts)
name_mapping = {
    "ae264e3637204a6fb9bb56bc8210ddfd" : "bogo_7days$10",
    "4d5c57ea9a6940dd891ad53e9dbe8da0" : "bogo_5days$10",
    "3f207df678b143eea3cee63160fa8bed" : "info_4days",
    "9b98b8c7a33c4b65b9aebfe6a799e6d9" : "bogo_7days$5",
    "0b1e1539f2cc45b7b9fa7c272da2e1d7" : "disc_10days$20-$5",
    "2298d6c36e964ae4a3e7e9706d1fb8c2" : "disc_7days$7-$3",
    "fafdcd668e3743c1bb461111dcafc2a4" : "disc_10days$10-$2",
    "5a8bc65990b245e5a138643cd4eb9837" : "info_3days",
    "f19421c1d4aa40978ebb69ca19b0e20d" : "bogo_5days$5",
    "2906b810c7d4411798c6938adc9daaa5" : "disc_7days$10-$2"
}
portfolio2 = portfolio.copy().replace(name_mapping)
transcript2 = transcript2.replace(name_mapping)
# save these dataframes to Excel files
portfolio2.to_excel("data/portfolio_clean.xlsx", index=False)
transcript2.to_excel("data/transcript_clean.xlsx", index=False)
profile2.to_excel("data/profile_clean.xlsx", index=False)
# save these dataframes to csv files
portfolio2.to_csv("data/portfolio_clean.csv", index=False)
transcript2.to_csv("data/transcript_clean.csv", index=False)
profile2.to_csv("data/profile_clean.csv", index=False)
portfolio2.sort_values(["id","difficulty"])
profile2.describe()
plt.rcParams["font.size"] = 18
plt.figure(figsize=[20,15])
plt.subplot(3,2,1)
plt.title("Distribution of age")
profile2.age.plot.hist(bins=20, color="DarkBlue")
plt.subplot(3,2,2)
plt.title("Distribution of gender")
profile.gender.value_counts(dropna=False).plot(kind="bar", color="DarkBlue")
plt.subplot(3,2,3)
plt.title("Distribution of income")
profile2.income.plot(kind="hist", bins=np.arange(0, profile2.income.max(), 1000), color="DarkBlue")
plt.rcParams["font.size"] = 10
plt.tight_layout()
plt.subplot(3,2,4)
plt.title("Distribution of registration year-month")
profile2.groupby([profile2.became_member_on.dt.year, profile2.became_member_on.dt.month])["became_member_on"].count().plot.bar(color="DarkBlue")
plt.subplot(3,2,5)
plt.title("month")
profile2.groupby([profile2.became_member_on.dt.month])["became_member_on"].count().plot(kind="bar", color="DarkBlue");
profile2.became_member_on.dt.year.value_counts()
# take a look at all of one customer's events over the month
transcript2[transcript2.person == "db691240326b44d8bfa7b438c4c41982"]
transcript2.describe()
This suggests that around 75% of coupons sent to customers are viewed, and around 75% of coupons sent to customers are completed.
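As a quick cross-check of these percentages, here is a minimal sketch that derives the rates directly from the event counts in transcript2:
# sketch: view / completion rates from the raw event counts
counts = transcript2.event.value_counts()
print("view rate:", counts["offer viewed"] / counts["offer received"])
print("completion rate:", counts["offer completed"] / counts["offer received"])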
plt.figure(figsize=[20,7])
# distribution of event types
plt.subplot(1,2,1)
plt.title("Distribution of customers' events")
plt.xlabel("Event type")
plt.ylabel("Frequency of occurrence")
transcript2.event.value_counts().plot(kind="bar", color="DarkBlue")
# Distribution of offers issued
plt.subplot(1,2,2)
transcript2.merge(portfolio2, on="id").id.value_counts().plot(kind="bar", color="DarkBlue")
plt.title("Distribution of types of coupons sent to customers")
plt.xlabel("Coupon type")
plt.ylabel("Frequency of Sending")
transcript2.event.value_counts()
# distribution of the amount of each of the 138953 transactions
plt.title("Distribution of the amount of money spent in one transaction")
plt.yscale("log")
transcript2.transaction.plot(kind="hist", bins=500)
plt.xlabel("Transaction amount ($)")
plt.ylabel("Number of transactions");
# distribution of each customer's monthly total transaction amount
plt.title("Monthly total transaction amount per customer")
transcript2.groupby("person")["transaction"].sum().hist(bins=100)
plt.xlabel("Total monthly amount of money a person has spent")
plt.ylabel("Number of persons");
I noticed that the minimum transaction amount is $0.05. I will take a look at these transactions below.
transcript2[transcript2.transaction == 0.05].sample(5)
A quick check of when these very small transactions occur: I could not find any dependency on time. This should be investigated further with the data engineers and colleagues from the marketing department.
# when are the $0.05 transactions made?
transcript2[transcript2.transaction == 0.05].time.hist()
plt.title("Distribution of times of $0.05 transactions")
plt.xlabel("Time (hours)")
plt.ylabel("Number of transactions")
These suggest that the time elapsed until a coupon is viewed (if it is viewed at all) has a right-skewed distribution. The same can be said of the time elapsed until completion.
Offers are basically issued on days 0, 7, 14, 17, 21, and 24. This means that the 10-day offers issued on day 21, as well as the 7-day and 10-day offers issued on day 24, expire after the end of the observation period.
It would be better to remove the results of these offers later. However, if there is a bias in how these offers were issued (e.g., if offers issued on day 24 went mainly to young women), the sample for a specific demographic and a specific offer type could become significantly small. This can be checked more rigorously as necessary; a sketch follows.
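As a minimal sketch of that check (my own addition, using only the transcript2 and profile2 columns defined above), compare the demographics of day-24 recipients against all recipients:
# sketch: demographic mix of day-24 (hour 576) offers vs. all offers
rec = transcript2[transcript2.event == "offer received"].merge(
    profile2, left_on="person", right_on="id", suffixes=("_offer", "_person"))
print(rec[rec.time == 576].age.describe())
print(rec.age.describe())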
# distribution of hours when offers were received
plt.figure(figsize=(20,20))
plt.subplot(3,1,1)
plt.xlim([0,714])
plt.ylim([0,16000])
transcript2[transcript2.event == "offer received"].time.value_counts().sort_index().plot(kind="bar", color="DarkBlue")
plt.title("Distribution of time when customers received a coupon")
plt.xlabel("Time")
plt.ylabel("Number of coupons received")
# distribution of hours when offers were viewed
plt.subplot(3,1,2)
plt.ylim([0,16000])
transcript2[transcript2.event == "offer viewed"].time.value_counts().sort_index().plot(kind="bar", color="DarkBlue")
plt.title("Distribution of time when customers viewed a coupon")
plt.xlabel("Time")
plt.ylabel("Number of coupons viewed")
# distribution of hours when offers were completed
plt.subplot(3,1,3)
plt.ylim([0,16000])
transcript2[transcript2.event == "offer completed"].time.value_counts().sort_index().plot(kind="bar", color="DarkBlue")
plt.title("Distribution of time when customers completed a coupon")
plt.xlabel("Time")
plt.ylabel("Number of coupons completed");
# persons with no offer events at all (id is null for transactions, so count() == 0 means only transactions)
transcript2.groupby("person").id.count()[transcript2.groupby("person").id.count() == 0]
Below I visualize customers in scatter plots based on three attributes: gender (male, female, other), age (x-axis), and income (y-axis).
This suggests that some of Starbucks' main customer clusters include:
plt.figure(figsize=[20,5])
plt.subplot(1,3,1)
plt.title("Male customers' age vs income")
plt.ylim([25000,125000])
male_profile = profile2[profile2.gender_M == 1]
sns.regplot(x=male_profile.age, y=male_profile.income, x_jitter=0.5, y_jitter=500, scatter_kws={"alpha":0.1}, fit_reg=False);
plt.subplot(1,3,2)
plt.title("Female customers' age vs income")
plt.ylim([25000,125000])
female_profile = profile2[profile2.gender_F == 1]
sns.regplot(x=female_profile.age, y=female_profile.income, x_jitter=0.5, y_jitter=500, scatter_kws={"alpha":0.1}, fit_reg=False);
plt.subplot(1,3,3)
plt.title("Other-gender customers' age vs income")
plt.ylim([25000,125000])
other_profile = profile2[profile2.gender_O == 1]
sns.regplot(x=other_profile.age, y=other_profile.income, x_jitter=0.5, y_jitter=500, scatter_kws={"alpha":0.1}, fit_reg=False);
portfolio2
# extract only the "offer received" rows for BOGO or discount coupons
offer_rec = transcript2[(transcript2.event == "offer received") & (transcript2.id.str[:4].isin(["bogo","disc"]))]
print(offer_rec.shape)
# remove 10-day offers issued on day 21 (hour 504), since they expire after the end of the observation period
offer_rec = offer_rec[~ ((offer_rec.time == 504) & (offer_rec.id.isin(["disc_10days$20-$5","disc_10days$10-$2"])))]
print(offer_rec.shape)
# remove 7-day offers issued on day 24 (hour 576)
offer_rec = offer_rec[~ ((offer_rec.time == 576) & (offer_rec.id.isin(["bogo_7days$10","bogo_7days$5","disc_7days$7-$3","disc_7days$10-$2"])))]
print(offer_rec.shape)
# remove 10-day offers issued on day 24 (hour 576)
offer_rec = offer_rec[~ ((offer_rec.time == 576) & (offer_rec.id.isin(["disc_10days$20-$5","disc_10days$10-$2"])))]
print(offer_rec.shape)
offer_rec.head()
offer_rec2 = offer_rec.copy()
offer_interaction = pd.DataFrame(columns=["person","age","income","became_member_on","offer_id","t_received","t_viewed","amt_till_viewed","t_completed","amt_till_completed"], index=offer_rec2.index)
offer_interaction[["person","t_received","offer_id"]] = offer_rec2[["person","time","id"]]
offer_interaction.head()
def from_id_to_duration(offer_id):
    # offer duration in hours (the portfolio stores duration in days)
    return portfolio2[portfolio2.id == offer_id].duration.iloc[0] * 24
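A quick usage check (my own addition; based on the mapped names above, a 7-day offer should translate to 168 hours):
# sanity check: 7-day BOGO -> 168 hours
assert from_id_to_duration("bogo_7days$10") == 7 * 24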
offer_rec2.shape
Time to run the cell below:
- 4 s for 100 recs
- 42 s for 1,000 recs
- 207 s for 5,000 recs
- 1,976 s for all 48,326 offer received events
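These timings grow roughly linearly with the number of events because every offer re-filters the entire transcript. A hedged optimization sketch (my own suggestion, not part of the original flow): group the transcript by person once and slice within the loop.
# sketch: pre-group the transcript so each iteration scans one person's events only
events_by_person = dict(tuple(transcript2.groupby("person")))
# inside the loop below, the first filter could then become:
#   pe = events_by_person[each_person]
#   this_person_events = pe[(each_time <= pe.time) &
#                           (pe.time <= each_time + from_id_to_duration(each_id))]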
t = time.time()
# elapsed hours until the offer is viewed / completed
viewed_times = []
completed_times = []
# total purchase amount until the offer is viewed / completed
amt_till_viewed = []
amt_till_completed = []
# for each "offer received" event...
for each_person, each_time, each_id in zip(offer_rec2.person, offer_rec2.time, offer_rec2.id):
    # all of this person's events within the offer's validity period
    this_person_events = transcript2[(transcript2.person == each_person) &
                                     (each_time <= transcript2.time) &
                                     (transcript2.time <= each_time + from_id_to_duration(each_id))]
    # about offer viewed...
    viewed_row = this_person_events[(this_person_events.event == "offer viewed") &
                                    (this_person_events.id == each_id)]
    try:
        viewed_time = viewed_row.iloc[0].time
        viewed_times.append(viewed_time - each_time)
        # transactions carry no offer_id, so filter by time only
        trans_events = this_person_events[(this_person_events.event == "transaction") &
                                          (this_person_events.time <= viewed_time)]
        if trans_events.shape[0] == 0:
            amt_till_viewed.append(0)
        else:
            amt_till_viewed.append(trans_events.transaction.sum())
    except IndexError:
        # the offer was never viewed within its validity period
        viewed_times.append(np.nan)
        amt_till_viewed.append(np.nan)
    # about offer completed...
    completed_row = this_person_events[(this_person_events.event == "offer completed") &
                                       (this_person_events.id == each_id)]
    try:
        completed_time = completed_row.iloc[0].time
        completed_times.append(completed_time - each_time)
        trans_events = this_person_events[(this_person_events.event == "transaction") &
                                          (this_person_events.time <= completed_time)]
        if trans_events.shape[0] == 0:
            amt_till_completed.append(0)
        else:
            amt_till_completed.append(trans_events.transaction.sum())
    except IndexError:
        # the offer was never completed within its validity period
        completed_times.append(np.nan)
        amt_till_completed.append(np.nan)
offer_interaction.t_viewed = viewed_times
offer_interaction.amt_till_viewed = amt_till_viewed
offer_interaction.t_completed = completed_times
offer_interaction.amt_till_completed = amt_till_completed
print("{}s for {} offer rec events".format(round(time.time() - t), offer_rec2.shape[0]))
def person_to_attributes(person_id):
    return profile2[profile2.id == person_id].iloc[0][["age","gender_F","gender_M","gender_O","income","became_member_on"]]
offer_interaction[["age","gender_F","gender_M","gender_O","income","became_member_on"]] = offer_interaction.person.apply(person_to_attributes)
# the apply above leaves "became_member_on" as object dtype; convert it back to datetime
offer_interaction.became_member_on = pd.to_datetime(offer_interaction.became_member_on)
## 1. Transform the column "became_member_on" according to [this post]
offer_interaction["became_year"] = offer_interaction.became_member_on.dt.year
offer_interaction["became_month_sin"] = (2 * math.pi * offer_interaction.became_member_on.dt.month / 12).apply(lambda x : math.sin(x))
offer_interaction["became_month_cos"] = (2 * math.pi * offer_interaction.became_member_on.dt.month / 12).apply(lambda x : math.cos(x))
offer_interaction["became_day_sin"] = (2 * math.pi * offer_interaction.became_member_on.dt.day / 31).apply(lambda x : math.sin(x))
offer_interaction["became_day_cos"] = (2 * math.pi * offer_interaction.became_member_on.dt.day / 31).apply(lambda x : math.cos(x))
offer_interaction["became_dow_sin"] = (2 * math.pi * offer_interaction.became_member_on.dt.dayofweek / 7).apply(lambda x : math.sin(x))
offer_interaction["became_dow_cos"] = (2 * math.pi * offer_interaction.became_member_on.dt.dayofweek / 7).apply(lambda x : math.cos(x))
## 2. Merge in the columns representing each sent offer's characteristics from "portfolio"
def from_offer_id_to_series(offer_id):
    offer = portfolio2[portfolio2.id == offer_id].iloc[0]
    return_s = pd.Series(0, index=["offer_bogo","offer_disc","difficulty","duration","reward",
                                   "email","mobile","social","web"])
    # one-hot encode the channels
    for each_c in offer.channels:
        return_s[each_c] = 1
    # one-hot encode the offer type
    if offer.offer_type == "bogo":
        return_s.offer_bogo = 1
    elif offer.offer_type == "discount":
        return_s.offer_disc = 1
    return_s.difficulty = offer.difficulty
    return_s.duration = offer.duration
    return_s.reward = offer.reward
    return return_s
offer_interaction[["offer_bogo","offer_disc","difficulty","duration","reward","email","mobile","social","web"]] = \
offer_interaction.offer_id.apply(from_offer_id_to_series)
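A quick spot check of the helper (my own addition; per the name mapping above, bogo_7days$10 is a 7-day BOGO):
s = from_offer_id_to_series("bogo_7days$10")
print(s.offer_bogo, s.duration)  # expect 1 and 7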
# "offer_interaction"'s each row represents each offer sent to customers.
print(offer_interaction.shape)
offer_interaction.head()
# save this dataframe to a file
offer_interaction.to_csv("data/interaction.csv", index=False)
# reload the dataframe
offer_interaction2 = pd.read_csv("data/interaction.csv")
## 3. Extract the rows where a customer viewed a coupon (before completing it).
# remove the offers that were never viewed
offer_viewed = offer_interaction2[~offer_interaction2.t_viewed.isnull()]
# remove the offers that were viewed only after being completed
# (note: a comparison against NaN evaluates to False, so rows that were never completed are kept)
offer_viewed = offer_viewed[~(offer_viewed.t_viewed >= offer_viewed.t_completed)]
print(offer_viewed.shape)
offer_viewed.head()
offer_viewed.to_csv("data/firstly_viewed_offers.csv", index=False)
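For reference, a tiny demonstration of the NaN-comparison behavior the filter above relies on: comparisons against NaN evaluate to False, so negating the mask keeps the never-completed rows.
# demo: ~(a >= b) keeps rows where b is NaN
a = pd.Series([24.0, 24.0])   # t_viewed
b = pd.Series([10.0, np.nan]) # t_completed
print(a >= b)     # [True, False]; the NaN comparison is False
print(~(a >= b))  # [False, True]; the never-completed row survives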