Statistical Laboratory
Alessandro Ortis - University of Catania
Consider the Mall Customers Data Set. The dataset can be downloaded at the following URL:
data = read.csv("Mall_Customers.csv")
names(data) <- c("CustomerID","Gender","Age","Annual_Income", "Spending_Score")
head(data)
summary(data$Age)
summary(data$Gender)
summary(data$Annual_Income)
summary(data$Spending_Score)
data_f = data[which(data$Gender == 'Female'),]
summary(data_f$Annual_Income)
summary(data_f$Spending_Score)
data_m = data[which(data$Gender == 'Male'),]
summary(data_m$Annual_Income)
summary(data_m$Spending_Score)
plot(x = data$Annual_Income, y=data$Spending_Score, col=c(1:2)[data$Gender])
#data$Gender <- as.factor(data$Gender)
#plot(data$Annual_Income, data$Spending_Score, col=data$Gender)
# Income vs Spending Score
in_spen = data[,c(4,5)]
KM<-kmeans(in_spen,5)
KM
c1 = data[which(KM$cluster == 1),]
head(c1)
plot(data$Annual_Income, data$Spending_Score, col=KM$cluster)
There are five distinct income-spending patterns.
c1 = data[which(KM$cluster == 1),]
c2 = data[which(KM$cluster == 2),]
c3 = data[which(KM$cluster == 3),]
c4 = data[which(KM$cluster == 4),]
c5 = data[which(KM$cluster == 5),]
# Alternatelively, I can extract these numbers from the KM variable
cluster_centroids <- data.frame(
Spending_Score = c(mean(c1$Spending_Score),
mean(c2$Spending_Score),
mean(c3$Spending_Score),
mean(c4$Spending_Score),
mean(c5$Spending_Score)),
Annual_Income = c(mean(c1$Annual_Income),
mean(c2$Annual_Income),
mean(c3$Annual_Income),
mean(c4$Annual_Income),
mean(c5$Annual_Income)))
row.names(cluster_centroids) <- c('C1','C2','C3','C4','C5')
cluster_centroids
Most interesting patterns are those related to people with high spending score and low budget, and viceversa.
# these two clusters may change at every kmeans() execution
hsli = c3 # high_spending_low_income
lshi = c5 # low_spending_high_income
summary(hsli)
summary(lshi)
The median of age is higher for the group of customers who has high incomes and low spending scores. The gender is balanced.
interesting_customers = data.frame(rbind(hsli,lshi))
age_vs_spending = data.frame(Age = interesting_customers$Age,
SpScore = interesting_customers$Spending_Score)
pairs(age_vs_spending, main = "Age/Spending",
pch = 21, bg = c("red", "green")[unclass(interesting_customers$Gender)])
cor(data$Age,data$Spending_Score)
cor(interesting_customers$Age,interesting_customers$Spending_Score)
There is not such a high linear correlation in the entire dataset. But it increases (in abs value) for this 'special' group of customers.
# check the gender... is not that interesting
#cluster = hsli
#counts <- table(cluster$Gender, cluster$Spending_Score)
#barplot(counts,
# main="Spending score vs gender",
# xlab="Spending score",
# col=c("blue","red"),
# legend = rownames(counts))
#cluster = interesting_customers
#counts <- table(cluster$Gender, cluster$Spending_Score)
#barplot(counts,
# main="Spending score vs gender",
# xlab="Spending score",
# col=c("blue","red"),
# legend = rownames(counts))
Are those interesting customers related to their age ?
par(mfrow=c(2,2)) # set the plotting area into a 2*2 array
hist(hsli$Age, freq=FALSE)
lines(density(hsli$Age), col="red")
hist(lshi$Age, freq=FALSE)
lines(density(lshi$Age), col="red")
boxplot(hsli$Age)
boxplot(lshi$Age)
par(mfrow=c(1,2))
boxplot(data$Age)
boxplot(data$Spending_Score)
Both distributions are slightly skewed toward the lower bound.
summary(data$Age)
summary(data$Spending_Score)
age_med = median(data$Age)
sps_med = median(data$Spending_Score)
Now, we may thing that the age can be related to the spending score. Let's check with a X-squared test.
# young low spending
yls = nrow(data[which((interesting_customers$Age<age_med)&(interesting_customers$Spending_Score<sps_med)),])
# young high spending
yhs = nrow(data[which((interesting_customers$Age<age_med)&(interesting_customers$Spending_Score>=sps_med)),])
# old low spending
ols = nrow(data[which((interesting_customers$Age>=age_med)&(interesting_customers$Spending_Score<sps_med)),])
# old high spending
ohs = nrow(data[which((interesting_customers$Age>=age_med)&(interesting_customers$Spending_Score>=sps_med)),])
c(yls,ols,yhs,ohs)
observed_table <- matrix(c(yls, ols,
yhs, ohs), nrow = 2, ncol = 2, byrow = T)
colnames(observed_table) <- c('Young', 'Old')
rownames(observed_table) <- c('LowSpending', 'HghSpending')
observed_table
rowSums(observed_table)
colSums(observed_table)
X <- chisq.test(observed_table)
X
X$expected
From the above result, we can observe that the p-value is less than the significance level (0.05). Hence, we can reject the null hypothesis and conclude that the two variables are not independent.
Therefore, in these two groups there is a high relationship between age and spending score. Indeed, there are no old people in the group of high spending customers. Now, repeat it by considering the median values of the interesting customers only, as we considered the median values computed to the whole dataset.
int_age_med = median(interesting_customers$Age)
int_sps_med = median(interesting_customers$Spending_Score)
# young low spending
yls = nrow(data[which((interesting_customers$Age<int_age_med)&(interesting_customers$Spending_Score<int_sps_med)),])
# young high spending
yhs = nrow(data[which((interesting_customers$Age<int_age_med)&(interesting_customers$Spending_Score>=int_sps_med)),])
# old low spending
ols = nrow(data[which((interesting_customers$Age>=int_age_med)&(interesting_customers$Spending_Score<int_sps_med)),])
# old high spending
ohs = nrow(data[which((interesting_customers$Age>=int_age_med)&(interesting_customers$Spending_Score>=int_sps_med)),])
c(yls,ols,yhs,ohs)
observed_table <- matrix(c(yls, ols,
yhs, ohs), nrow = 2, ncol = 2, byrow = T)
colnames(observed_table) <- c('Young', 'Old')
rownames(observed_table) <- c('LowSpending', 'HghSpending')
observed_table
rowSums(observed_table)
colSums(observed_table)
X <- chisq.test(observed_table)
X
X$expected
In this case the p-value indicates that there is a 0.085% of prob. that the two variables are independent and to observe such statstic value. Such value is still lower a 5% significance level, but is not that low as we are used to observe when data are strongly realated. This may be due to the low number of samples (57), as we know that the X-squared test is affected by the number of samples.
So far, the test suggested that in the two interesting customers groups there may be a relationship between the spending score and the age of the customers. Now, repeat the X-squared test to assess, with more data, if this relationship may be extended to the general customers of the Mall.
# young low spending
yls = nrow(data[which((data$Age<age_med)&(data$Spending_Score<sps_med)),])
# young high spending
yhs = nrow(data[which((data$Age<age_med)&(data$Spending_Score>=sps_med)),])
# old low spending
ols = nrow(data[which((data$Age>=age_med)&(data$Spending_Score<sps_med)),])
# old low spending
ohs = nrow(data[which((data$Age>=age_med)&(data$Spending_Score>=sps_med)),])
c(yls,ols,yhs,ohs)
observed_table <- matrix(c(yls, ols,
yhs, ohs), nrow = 2, ncol = 2, byrow = T)
colnames(observed_table) <- c('Young', 'Old')
rownames(observed_table) <- c('LowSpending', 'HghSpending')
observed_table
rowSums(observed_table)
colSums(observed_table)
X <- chisq.test(observed_table)
X
X$expected
From the above result, we can observe that the p-value is less than the significance level (0.05). Hence, we can reject the null hypothesis and conclude that the two variables are not independent.
To further assess that spending score is not related to the gender, we can repeat the test with the gender counts in the contingency table.
# male low spending
mls = nrow(data[which((data$Gender=='Male')&(data$Spending_Score<sps_med)),])
# male high spending
mhs = nrow(data[which((data$Gender=='Male')&(data$Spending_Score>=sps_med)),])
# female low spending
fls = nrow(data[which((data$Gender=='Female')&(data$Spending_Score<sps_med)),])
# female high spending
fhs = nrow(data[which((data$Gender=='Female')&(data$Spending_Score>=sps_med)),])
c(mls,mhs,fls,fhs)
observed_table <- matrix(c(mls, mhs,
fls, fhs), nrow = 2, ncol = 2, byrow = T)
colnames(observed_table) <- c('Male', 'Female')
rownames(observed_table) <- c('LowSpending', 'HghSpending')
observed_table
rowSums(observed_table)
colSums(observed_table)
X <- chisq.test(observed_table)
X
new_data = data.frame(data, Young =data$Age<median(data$Age) )
plot(x = new_data$Annual_Income, y=new_data$Spending_Score, pch = 21, bg=c("Red", "Blue")[as.integer(new_data$Young)+1])
The plot shows that in the group of low spending and low income the most of people are old. Moreover, the younger customers are mostly distributed in the higher side of spanding score, regardless their incomes.
the p-value is high: >91% to observe the statistic in the case of independence (H0).
Summarize the X-squared test results:
Concluson: the Mall manager may decide to include more services dedicated to old customers, regardless their gender, mantaining what attract young customers.