For the complete Report please go to👉: Click Here

For the Dashboard please go to👉: Click Here

Exploratory data analysis:
What can we get from the data?
Basic analysis:
1. Distribution of complaint reasons and sub-reasons;
2. Analysis of complaint handling results;
3. Distribution of complaint sources;
4. The efficiency of handling urgent complaints;
5. The reason why cannot close the case with in 24H/urge so many time;
6. The impact of gender and membership level on complaints;
7. Region-specific complaint patterns;
8. Compensation amount info;
9. Distribution of the responsibility for complaints;

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# pd.set_option('display.max_columns', None)

#load the dataframe
path = '/Users/houjialyu/Documents/Documents/数据分析/项目/call_center/QA Data Analysis Test.xlsx'
df = pd.read_excel(path, sheet_name='data')

#check the basic info of the df:
print(df.info) #[1042 rows x 15 columns]
print(df.columns)
print(df.head())
print(df.tail())
#data cleaning

#check the duplicate rows:
print(df.duplicated().sum())  #0

#check the null value:
print(df.isnull().sum())  #there are 5 null values in the "subreason" column, fill them with "N"
df.fillna('N', inplace=True)

#check the 0 value:
print(df.values)

#Distribution of complaint reasons and sub-reasons:
re_sub_re = pd.DataFrame()
re_sub_re['count'] = df.groupby(['reason', 'subreason'])['complaintsid'].count()
re_sub_re['pct'] = re_sub_re['count'].apply(lambda x: (x / re_sub_re['count'].sum()) * 100).round(2)
print(re_sub_re)

#Analysis of complaint handling results:
res = df.groupby(['result'])['complaintsid'].count()
print(res.sort_values(ascending=False))

#Distribution of complaint sources
src = df.groupby('source')['complaintsid'].count().reset_index()
src.columns = ['source', 'count']
src['pct'] = src['count'].apply(lambda x: (x / src['count'].sum())*100).round(2)
src.sort_values(by='pct', ascending=False, inplace=True)
plt.bar(
    x=src['source'],
    width=0.5,
    height=src['pct'],
    color='skyblue',
    alpha=0.8
)
for x, y in enumerate(src['pct']):
    plt.text(x, y, str(y) + '%', fontsize=10, ha='center', va='top')
plt.title("Distribution of Complaint Sources")
plt.show()

#the efficiency of handling urgent complaints:
df['urge_count'] = df['complaint_urge_count'].apply(lambda x: '<5' if x < 5 else('<10' if x < 10 else '>=10'))
pivot_t = pd.crosstab([df['isemergency'], df['urge_count']], df['is_closed_in_24h'], margins=True)
pivot_t['pct'] = pivot_t['All'].apply(lambda x: (x / pivot_t.loc['All', 'All']) * 100).round(2)
print(pivot_t)

#The reason why cannot close the case with in 24h/urge more than 5 times
re_sub_re2 = df[df['is_closed_in_24h'] == 'N'].groupby(['reason', 'subreason'])['complaintsid'].count()
re_sub_re2_filtered = re_sub_re2.apply(lambda x: x > 5)
print('The reason why cannot close the case with in 24h:\n {}'.format(re_sub_re2[re_sub_re2_filtered]))
print('--'*40)
re_sub_re3 = df[df['urge_count'] != '<5'].groupby(['reason', 'subreason'])['complaintsid'].count()
print('The reason why urge more than 5 times:\n {}'.format(re_sub_re3))

#The impact of gender and membership level on complaints:
gender = df.groupby(['gender'])['complaintsid'].count().reset_index()
gender.columns = ['gender', 'count']
plt.bar(
    x=gender['gender'],
    width=0.5,
    height=gender['count'],
    color='skyblue',
    alpha=0.8
)
for x, y in enumerate(gender['count']):
    plt.text(x, y, str(y), fontsize=10, ha='center', va='top')
plt.title("Distribution of Clients' Gender")
plt.show()

membership = df.groupby(['membergrade'])['complaintsid'].count().reset_index()
membership.columns = ['membergrade', 'count']
membership['pct'] = membership['count'].apply(lambda x: (x / membership['count'].sum())*100).round(2)
membership.sort_values(by='pct', inplace=True)
plt.barh(
    membership['membergrade'],
    membership['pct'],
    height=0.5,
    color='skyblue',
    alpha=0.8
)
for x, y in enumerate(membership['pct']):
    plt.text(y+7, x, str(y)+"%", fontsize=10, ha='right', va='center')
plt.title("Distribution of Clients' Member_grade")
plt.show()

#Region-specific complaint patterns:
region = df.groupby(['customer region'])['complaintsid'].count().reset_index()
region.columns = ['region', 'count']
region['pct'] = region['count'].apply(lambda x: (x / region['count'].sum())*100).round(2)
region_filtered = region['pct'].apply(lambda x: x > 5)
print(region[region_filtered].sort_values(by='pct', ascending=False))

#Compensation amount info:
compensation = df['compensation_amount_CNY'].describe().round(2)
print(df['compensation_amount_CNY'].sum().round(2))
print(compensation.median())
print(compensation)

#Distribution of the responsibility for complaints
resp = df.groupby(['responsibility'])['complaintsid'].count()
plt.pie(
    resp,
    labels=tuple(resp.index),
    autopct='%.2f%%',
)
plt.show()

Advanced analysis:
1. Relevance of customer area to complaint reason;
2. The correlation between complaint handling results and customer satisfaction;
2.1 The proportion of cases if can be completed within 24H at each membership level /reason+subreason;
2.2 Compensation strategy effect analysis;
3. Time series analysis;
4. Complaints handling team performance evaluation;

#Relevance of customer area to complaint reason
top_regions = df['customer region'].value_counts().index[:5]
top_reasons_data = df[df['customer region'].isin(top_regions)]
pivot_data = pd.crosstab(index=top_reasons_data['reason'], columns=top_reasons_data['customer region'], normalize='columns') * 100
plt.figure(figsize=(10, 8))
sns.heatmap(pivot_data, annot=True, fmt=".1f", linewidths=.5, cmap="YlGnBu")
# plt.title('Complaint Reasons by Customer Region (%)')
plt.ylabel('Reason')
plt.xlabel('Customer Region')
plt.tight_layout()
plt.savefig('/Users/houjialyu/Documents/Documents/数据分析/项目/call_center/reason_region.png', dpi=500)
plt.show()



#The correlation between complaint handling results and customer satisfaction:

#Even though our source data is missing a key indicator, the CSAT(Customer Satifaction),
#it can still be inferred through indirect indicators such as whether complaints are resolved within 24 hours
#and the size of the compensation amount:

fig, axs = plt.subplots(2, 1, figsize=(10, 18))

#The proportion of cases at each membership level that can be completed within 24H:
pivot_table_m = pd.crosstab(df['membergrade'], df['is_closed_in_24h'], normalize='index')*100
pivot_table_m.sort_values(by='N', inplace=True)
pivot_table_m.plot.barh(stacked=True, color=('r', 'skyblue'), alpha=0.8, ax=axs[0])

for i in range(len(pivot_table_m)):
    for j, val in enumerate(pivot_table_m.iloc[i]):
        if val != 0:
            axs[0].text((val/2)+2, i, f'{val:.2f}%', ha='center', va='center', fontsize=10, fontweight='bold')

axs[0].set_title('Pct of if case is_closed_in_24h by member grade')
axs[0].set_xlabel('Percentage')
axs[0].set_ylabel('Member Grade')
axs[0].legend(title='is_closed_in_24h')


#Compensation strategy effect analysis:
#The relationship between the cause of complaint and the amount of compensation
complaint_compensation = df.groupby(['reason'])['compensation_amount_CNY'].mean().round(2)
complaint_compensation.sort_values(ascending=False, inplace=True)
complaint_compensation.plot.bar(ax=axs[1], color='skyblue', alpha=0.8)
for x, y in enumerate(complaint_compensation):
    axs[1].text(x, y/2, y, ha='center', va='center', fontsize=10, fontweight='bold')
axs[1].set_ylabel('AVG of Compensation(CNY)')
axs[1].set_title('AVG Compensation(CNY) caused by different Reasons')

#The relationship between the responsibility of complaint and the amount of compensation
resp_compensation = df.groupby(['responsibility'])['compensation_amount_CNY'].mean().round(2)
resp_compensation.sort_values(ascending=False, inplace=True)
resp_compensation.plot.bar(ax=axs[2], color='skyblue', alpha=0.8)
for x, y in enumerate(resp_compensation):
    axs[2].text(x, y/2, y, ha='center', va='center', fontsize=10, fontweight='bold')
axs[2].set_ylabel('AVG of Compensation(CNY)')
axs[2].set_title('AVG Compensation(CNY) caused by different Responsibility')

plt.tight_layout()
plt.savefig('/Users/houjialyu/Documents/Documents/数据分析/项目/call_center/compensation_strategy.png', dpi=500)
plt.show()


#Compensation strategy effect analysis:
# calculate the mean compensation of the reason and responsibility
avg_compensation = df.groupby(['reason', 'responsibility'])['compensation_amount_CNY'].mean().reset_index()
# chart data prepare
reasons = avg_compensation['reason'].unique()
responsibilities = avg_compensation['responsibility'].unique()
stack_data = pd.DataFrame(index=reasons, columns=responsibilities).fillna(0)

for _, row in avg_compensation.iterrows():
    stack_data.loc[row['reason'], row['responsibility']] = row['compensation_amount_CNY']

total_compensation_by_reason = stack_data.sum(axis=1)
proportions_data = stack_data.divide(total_compensation_by_reason, axis=0)

bottom = pd.Series([0] * len(reasons), index=reasons)
colors = plt.cm.tab20(range(len(responsibilities)))

for responsibility, color in zip(responsibilities, colors):
    proportion = proportions_data[responsibility]
    bars = axs[1].bar(stack_data.index, stack_data[responsibility], bottom=bottom, label=responsibility, color=color)
    bottom += stack_data[responsibility]
    # add values label
    for bar, prop in zip(bars, proportion):
        height = bar.get_height()
        if height > 0:  # only label the bar != 0
            label_x_pos = bar.get_x() + bar.get_width() / 2
            label_y_pos = bar.get_y() + height / 2
            axs[1].text(label_x_pos, label_y_pos, f'{prop:.1%}', ha='center', va='center', color='white', fontsize=10, fontweight='bold')

axs[1].set_title('Average Compensation Amount by Reason and Responsibility with Proportions')
axs[1].set_xlabel('Reason')
axs[1].set_ylabel('Average Compensation Amount (CNY)')
axs[1].legend(title='Responsibility')
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.savefig('/Users/houjialyu/Documents/Documents/数据分析/项目/call_center/compensation_strategy.png', dpi=500)
plt.show()




# #Complaints handling team performance evaluation:
df['is_closed_in_24h_numeric'] = df['is_closed_in_24h'].apply(lambda x: 1 if x == 'Y' else 0)
closure_rate_within_24h_by_team = df.groupby(['Team'])['is_closed_in_24h_numeric'].mean() * 100

compensation_cases_proportion_by_team = (df[df['compensation_amount_CNY'] > 0].groupby('Team').size() / df.groupby('Team').size()) * 100

team_performance_metrics = pd.DataFrame({
    'Team': closure_rate_within_24h_by_team.index,
    'Closure Rate Within 24h (%)': closure_rate_within_24h_by_team.values,
    'Compensation Cases Proportion (%)': compensation_cases_proportion_by_team.values
})
# print(team_performance_metrics)

plt.figure(figsize=(14, 6))

plt.subplot(1, 2, 1)
sns.barplot(x='Closure Rate Within 24h (%)', y='Team', data=team_performance_metrics, palette='coolwarm')
for x, y in enumerate(team_performance_metrics['Closure Rate Within 24h (%)']):
    plt.text(y/2, x, f'{y:.2f}%', ha='center', va='center', fontsize=10, fontweight='bold')
plt.title('Closure Rate Within 24h by Team')

plt.subplot(1, 2, 2)
sns.barplot(x='Compensation Cases Proportion (%)', y='Team', data=team_performance_metrics, palette='coolwarm')
for x, y in enumerate(team_performance_metrics['Compensation Cases Proportion (%)']):
    plt.text(y/2, x, f'{y:.2f}%', ha='center', va='center', fontsize=10, fontweight='bold')
plt.title('Compensation Cases Proportion by Team')

plt.tight_layout()
plt.savefig('/Users/houjialyu/Documents/Documents/数据分析/项目/call_center/teams.png', dpi=500)
plt.show()