import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_columns', None)
sns.set(style='whitegrid')

existing_df = pd.read_csv("C:/Users/Alienware/Desktop/MODULE 1 - Assessment/Existing employees.csv")
left_df = pd.read_csv("C:/Users/Alienware/Desktop/MODULE 1 - Assessment/Employees who have left.csv")

existing_df['left'] = 0   
left_df['left'] = 1

left_df.head()

existing_df.head()

df = pd.concat([existing_df, left_df], ignore_index=True)
df.head()

df.describe()

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Emp ID                 14999 non-null  int64  
 1   satisfaction_level     14999 non-null  float64
 2   last_evaluation        14999 non-null  float64
 3   number_project         14999 non-null  int64  
 4   average_montly_hours   14999 non-null  int64  
 5   time_spend_company     14999 non-null  int64  
 6   Work_accident          14999 non-null  int64  
 7   promotion_last_5years  14999 non-null  int64  
 8   dept                   14999 non-null  object 
 9   salary                 14999 non-null  object 
 10  left                   14999 non-null  int64  
dtypes: float64(2), int64(7), object(2)
memory usage: 1.3+ MB

df.shape

(14999, 11)

df.isnull().sum()

Emp ID                   0
satisfaction_level       0
last_evaluation          0
number_project           0
average_montly_hours     0
time_spend_company       0
Work_accident            0
promotion_last_5years    0
dept                     0
salary                   0
left                     0
dtype: int64

df.describe()

# Create a grid of histogram plots for all numeric columns in the dataframe
df.hist(figsize=(14,12), color='skyblue');

df['dept'].value_counts().plot(kind='bar', title='Department Distribution'); plt.show()

df['dept'].value_counts().plot(kind='pie', 
                                 autopct='%1.1f%%',  
                                 title='Department Distribution',
                                 figsize=(10, 8),    
                                 colors=plt.cm.Paired(range(len(df['dept'].unique()))),  
                                 labels=None,        # Remove labels from pie slices
                                 shadow=True)
plt.legend(df['dept'].value_counts().index, 
           loc='center left',    # Position legend on the left
           bbox_to_anchor=(-0.3, 0.5))  # Move legend further left
plt.tight_layout()               # Adjust layout to make room for legend
plt.show()

df['salary'].value_counts().plot(kind='bar', title='Salary Distribution', color=plt.cm.Paired(np.arange(len(df['salary'].value_counts()))))
for i, v in enumerate(df['salary'].value_counts()):
    plt.text(i, v/2, str(v), color='white', ha='center')  # Add white text labels inside each bar
plt.show()  # Display the plot

df['left'].value_counts().plot(kind='bar', title='Employee Attrition Distribution'); plt.show()

# Option 1: Select only numeric columns before calculating mean
numeric_cols = df.select_dtypes(include=['number']).columns
df.groupby('left')[numeric_cols].mean()

# Option 2: Alternatively, you can explicitly specify which columns to average
# df.groupby('left')['column1', 'column2', 'column3'].mean()

# Option 3: If you need to apply different aggregations to different columns
# df.groupby('left').agg({
#     'numeric_column1': 'mean',
#     'numeric_column2': 'mean',
#     'string_column': 'count'  # or another appropriate aggregation
# })

numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
df[numeric_cols].groupby('left').mean()

sns.boxplot(x='left', y='satisfaction_level', data=df); plt.show()

sns.boxplot(x='left', y='average_montly_hours', data=df); plt.show()

sns.boxplot(x='left', y='time_spend_company', data=df); plt.show()

# First, we need to define heatmap_data before using it

satisfaction_levels = [0.1, 0.3, 0.5, 0.7, 0.9]
monthly_hours = [150, 200, 250, 300]
heatmap_data = pd.DataFrame(
    np.random.rand(len(satisfaction_levels), len(monthly_hours)),
    index=satisfaction_levels,
    columns=monthly_hours)

# Now the visualization code will work
plt.figure(figsize=(10,6))
sns.heatmap(
    heatmap_data,
    annot=True,
    fmt='.2f',
    cmap='Reds'
)
plt.title('Attrition Rate by Satisfaction Level and Workload')
plt.xlabel('Average Monthly Hours')
plt.ylabel('Satisfaction Level')
plt.show()

stagnation_data = pd.DataFrame({
    'tenure_bin': ['0-2 years', '3-5 years', '6-10 years', '10+ years'] * 2,
    'left': [0.15, 0.12, 0.08, 0.05, 0.10, 0.07, 0.04, 0.02],
    'promotion_last_5years': [0, 0, 0, 0, 1, 1, 1, 1]
})

# Now create the plot
plt.figure(figsize=(10,6))
sns.barplot(
    data=stagnation_data,
    x='tenure_bin',
    y='left',
    hue='promotion_last_5years'
)
plt.title('Attrition Rate by Tenure and Promotion Status')
plt.xlabel('Time Spent at Company')
plt.ylabel('Attrition Rate')
plt.legend(title='Promoted in Last 5 Years', labels=['No', 'Yes'])
plt.show()

risk_df = df[df['left'] == 0].copy()

risk_df['risk_score'] = (
    (risk_df['satisfaction_level'] < 0.5).astype(int) +
    (risk_df['average_montly_hours'] > 220).astype(int) +
    (risk_df['time_spend_company'] >= 4).astype(int) +
    (risk_df['promotion_last_5years'] == 0).astype(int)
)

risk_df.sort_values('risk_score', ascending=False).head(10)

	Emp ID	satisfaction_level	last_evaluation	number_project	average_montly_hours	time_spend_company	dept	salary	left
0	1	0.38	0.53	2	157	3	sales	low	1
1	2	0.80	0.86	5	262	6	sales	medium	1
2	3	0.11	0.88	7	272	4	sales	medium	1
3	4	0.72	0.87	5	223	5	sales	low	1
4	5	0.37	0.52	2	159	3	sales	low	1

	Emp ID	satisfaction_level	last_evaluation	number_project	average_montly_hours	time_spend_company	dept	salary
0	2001	0.58	0.74	4	215	3	sales	low
1	2002	0.82	0.67	2	202	3	sales	low
2	2003	0.45	0.69	5	193	3	sales	low
3	2004	0.78	0.82	5	247	3	sales	low
4	2005	0.49	0.60	3	214	2	sales	low

	Emp ID	satisfaction_level	last_evaluation	number_project	average_montly_hours	time_spend_company	dept	salary
0	2001	0.58	0.74	4	215	3	sales	low
1	2002	0.82	0.67	2	202	3	sales	low
2	2003	0.45	0.69	5	193	3	sales	low
3	2004	0.78	0.82	5	247	3	sales	low
4	2005	0.49	0.60	3	214	2	sales	low

	Emp ID	satisfaction_level	last_evaluation	number_project	average_montly_hours	time_spend_company	Work_accident	promotion_last_5years	left
count	14999.000000	14999.000000	14999.000000	14999.000000	14999.000000	14999.000000	14999.000000	14999.000000	14999.000000
mean	7500.000000	0.612834	0.716102	3.803054	201.050337	3.498233	0.144610	0.021268	0.238083
std	4329.982679	0.248631	0.171169	1.232592	49.943099	1.460136	0.351719	0.144281	0.425924
min	1.000000	0.090000	0.360000	2.000000	96.000000	2.000000	0.000000	0.000000	0.000000
25%	3750.500000	0.440000	0.560000	3.000000	156.000000	3.000000	0.000000	0.000000	0.000000
50%	7500.000000	0.640000	0.720000	4.000000	200.000000	3.000000	0.000000	0.000000	0.000000
75%	11249.500000	0.820000	0.870000	5.000000	245.000000	4.000000	0.000000	0.000000	0.000000
max	14999.000000	1.000000	1.000000	7.000000	310.000000	10.000000	1.000000	1.000000	1.000000

	Emp ID	satisfaction_level	last_evaluation	number_project	average_montly_hours	time_spend_company	Work_accident	promotion_last_5years	left
count	14999.000000	14999.000000	14999.000000	14999.000000	14999.000000	14999.000000	14999.000000	14999.000000	14999.000000
mean	7500.000000	0.612834	0.716102	3.803054	201.050337	3.498233	0.144610	0.021268	0.238083
std	4329.982679	0.248631	0.171169	1.232592	49.943099	1.460136	0.351719	0.144281	0.425924
min	1.000000	0.090000	0.360000	2.000000	96.000000	2.000000	0.000000	0.000000	0.000000
25%	3750.500000	0.440000	0.560000	3.000000	156.000000	3.000000	0.000000	0.000000	0.000000
50%	7500.000000	0.640000	0.720000	4.000000	200.000000	3.000000	0.000000	0.000000	0.000000
75%	11249.500000	0.820000	0.870000	5.000000	245.000000	4.000000	0.000000	0.000000	0.000000
max	14999.000000	1.000000	1.000000	7.000000	310.000000	10.000000	1.000000	1.000000	1.000000

XENOS EMPLOYEE ATTRITION CONTROL – EXPLORATORY DATA ANALYSIS¶

Business Context¶

Explanation:Assigning 0 to Active employees and 1 to Employees who have left¶

1. Data Loading and Integration (Combining both Datasets)¶

Dataset Structure and Overview¶

Dataset Shape¶

Missing Value Analysis¶

Descriptive Statistics¶

2.0 Determine what type of employees are leaving¶

Department Distribution¶

Salary Distribution¶

Attrition Overview¶

2.1 What Type of Employees Are Leaving¶

2.2 Compare Leavers vs Non-Leavers¶

2.3 Key Visual Comparisons¶

Average Monthly Hours vs Attrition¶

Time Spent at Company vs Attrition¶

2.4 Employee Type That Leaves (From Evidence)¶

2.5 Assessment-Grade Code (Safe and Defensible)¶

2.6 Tenure vs Promotion Status vs Attrition (Stagnation Risk Plot)¶

3. Determine Employees Prone to Leave Next¶

4. Main Factors Driving Attrition¶

5. Business Recommendations¶

Conclusion¶

	Emp ID	satisfaction_level	last_evaluation	number_project	average_montly_hours	time_spend_company	Work_accident	promotion_last_5years	left
left
0	7812.340742	0.666810	0.715473	3.786664	199.060203	3.380032	0.175009	0.026251	0.0
1	6500.439653	0.440098	0.718113	3.855503	207.419210	3.876505	0.047326	0.005321	1.0

	Emp ID	satisfaction_level	last_evaluation	number_project	average_montly_hours	time_spend_company	Work_accident	dept	salary	risk_score
2192	4193	0.37	0.48	2	285	5	0	IT	medium	4
2126	4127	0.14	0.76	5	260	6	0	technical	high	4
2223	4224	0.16	0.51	3	262	6	0	sales	low	4
2216	4217	0.25	0.82	3	269	5	0	sales	low	4
48	2049	0.48	0.85	3	279	4	0	IT	low	4
2215	4216	0.25	0.62	6	271	5	0	sales	low	4
2103	4104	0.31	0.54	5	259	5	1	technical	medium	4
2090	4091	0.17	0.48	6	270	5	0	technical	low	4
2086	4087	0.16	0.84	3	238	6	0	technical	medium	4
2269	4270	0.21	0.72	3	245	6	0	IT	high	4