In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
pd.set_option("display.max_rows", None, "display.max_columns", None)
In [8]:
file_path = '/brain_stroke.csv'
strokedata_df = pd.read_csv(file_path)

Project Goals¶

I will be investigating data sets related to hypertension in the United States. I am interested in seeing correlations between patients having hypertension and it resulting in stroke or heart attack. Sex based differences and lifestyle differences will also be investigated. The website in which the analysis is located can be found HERE (insert a link)

Table 1: Stroke Data¶

This table is from a CSV file downloaded from Kaggle (link to Kaggle dataset), but the data is orignally from an electronic health record controlled by McKinsey & Company (link to EHR/Healthcare Hackathon Source). This was a part of their healthcare hackathon in 2018 and this dataset is accessible as a free dataset repository. The dataset contains information about 29,072 patients having 12 common variables. The variables are age, gender, marital status, patient identifier, work type, residence type (urban or rural), heart disease condition (binary attribute), body mass index (BMI), smoking status, glucose level, and hypertension status (binary attribute). I will be answering questions such as, “Is there a sex based difference regarding hypertension, heart disease, and stroke?”, “What age groups are affected most by hypertension and stroke prevelence?”, “How frequently does hypertension result in stroke?” and “Is there any correlation between various patient demographics and hypertension prevelence?”

In [9]:
strokedata_df.head()
Out[9]:
gender age hypertension heart_disease ever_married work_type Residence_type avg_glucose_level bmi smoking_status stroke
0 Male 67.0 0 1 Yes Private Urban 228.69 36.6 formerly smoked 1
1 Male 80.0 0 1 Yes Private Rural 105.92 32.5 never smoked 1
2 Female 49.0 0 0 Yes Private Urban 171.23 34.4 smokes 1
3 Female 79.0 1 0 Yes Self-employed Rural 174.12 24.0 never smoked 1
4 Male 81.0 0 0 Yes Private Urban 186.21 29.0 formerly smoked 1
In [10]:
print(strokedata_df.columns)
strokedata_df = strokedata_df.loc[:, ~strokedata_df.columns.isin(["ever_married", "work_type"])]
strokedata_df = strokedata_df.reset_index(drop=True)

#dropping ever married variable and work type variable as they are not needed for what we will be exploring
Index(['gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')
In [11]:
counts=strokedata_df['stroke'].value_counts()
print(counts)
#248 people in the data set had a stroke (1 indicates stroke, 0 indicates no stroke)
0    4733
1     248
Name: stroke, dtype: int64
In [12]:
hyp_counts=strokedata_df['hypertension'].value_counts()
print(hyp_counts)
#479 people in the data set had hypertension (1 indicates hypertension and 0 indicates no hypertension)
0    4502
1     479
Name: hypertension, dtype: int64
In [13]:
def replace_empty_with_NAN(entry):
    entry = str(entry)
    if re.search("–", entry) != None:
        return np.nan
    elif re.search('="0"',entry) != None:
        return np.nan
    elif re.search('="0.00"', entry) != None:
        return np.nan
    return entry
In [14]:
for col in strokedata_df.columns:
    strokedata_df[col] = strokedata_df[col].apply(replace_empty_with_NAN)
In [15]:
strokedata_df.dtypes
#needs to be formatted
Out[15]:
gender               object
age                  object
hypertension         object
heart_disease        object
Residence_type       object
avg_glucose_level    object
bmi                  object
smoking_status       object
stroke               object
dtype: object
In [20]:
lst_of_col = ["heart_disease", "hypertension", "stroke"]

for col in lst_of_col:
    strokedata_df[col] = strokedata_df[col].astype('bool')
Out[20]:
gender               object
age                  object
hypertension           bool
heart_disease          bool
Residence_type       object
avg_glucose_level    object
bmi                  object
smoking_status       object
stroke                 bool
dtype: object
In [22]:
lst_of_col = ["avg_glucose_level", "bmi", "age"]

for col in lst_of_col:
    strokedata_df[col] = strokedata_df[col].astype('float64')
strokedata_df.dtypes

#changed dtypes for proper formatting
Out[22]:
gender                object
age                  float64
hypertension            bool
heart_disease           bool
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                  bool
dtype: object
In [26]:
# Create a dictionary to store the counts.
stroke_counts = {}

# Loop through each row in the data.
for index, row in strokedata_df.iterrows():
    # Get the gender and stroke values.
    gender = row["gender"]
    stroke = row["stroke"]

    # If the gender is not already in the dictionary, add it.
    if gender not in stroke_counts:
        stroke_counts[gender] = {}

    # If the stroke is not already in the dictionary for the gender, add it.
    if stroke not in stroke_counts[gender]:
        stroke_counts[gender][stroke] = 0

    # Increment the count for the stroke.
    stroke_counts[gender][stroke] += 1

# Print the counts.
for gender, strokes in stroke_counts.items():
    print(f"{gender}:")
    for stroke, count in strokes.items():
        print(f"\t{stroke}: {count}")
Male:
	True: 2074
Female:
	True: 2907
In [47]:
import matplotlib.pyplot as plt

# Create a dictionary to store the counts.
stroke_counts = {}

# Loop through each row in the data.
for index, row in strokedata_df.iterrows():
    # Get the gender and stroke values.
    gender = row["gender"]
    stroke = row["stroke"]

    # If the gender is not already in the dictionary, add it.
    if gender not in stroke_counts:
        stroke_counts[gender] = {}

    # If the stroke is not already in the dictionary for the gender, add it.
    if stroke not in stroke_counts[gender]:
        stroke_counts[gender][stroke] = 0

    # Increment the count for the stroke.
    stroke_counts[gender][stroke] += 1

# Plot the counts.
bar_width = 0.35
x_offset = 0

for gender, strokes in stroke_counts.items():
    stroke_names = list(strokes.keys())
    counts = list(strokes.values())

    x = [i + x_offset for i in range(len(stroke_names))]
    plt.bar(x, counts, label=gender, width=bar_width)
    x_offset += bar_width

# Add labels and title.
plt.xticks([i + (x_offset - bar_width) / 2 for i in range(len(stroke_names))], stroke_names)
plt.xlabel("Stroke Type")
plt.ylabel("Count")
plt.title("Stroke Counts by Gender")

# Add legend and show the plot.
plt.legend()
plt.tight_layout()
plt.show()

#females seem to have more strokes
In [62]:
# Convert 'Age' column to numeric and sort the DataFrame by 'Age'.
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')
df = df.sort_values(by='Age')

age_values = df['Age'].unique()
grouped_data = df.groupby(['Age', 'Gender'])['Hypertension'].sum().unstack()

grouped_data.plot(kind='bar', stacked=True, alpha=0.5)

# Add the title and labels.
plt.title("Hypertension by Age and Gender")
plt.xlabel("Age")
plt.ylabel("Number of Individuals")
plt.legend(title="Gender")

# Set the x-ticks to sorted age values.
plt.xticks(ticks=range(len(age_values)), labels=age_values, rotation=45)  # Rotate x-axis labels for better readability

plt.show()
In [70]:
plt.scatter(df["age"], df["stroke"])

# Add labels and title
plt.xlabel("Age")
plt.ylabel("Stroke")
plt.title("Scatter Plot of Age vs. Stroke")

# Show the plot
plt.show()
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
/usr/local/lib/python3.10/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   3801             try:
-> 3802                 return self._engine.get_loc(casted_key)
   3803             except KeyError as err:

/usr/local/lib/python3.10/dist-packages/pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

/usr/local/lib/python3.10/dist-packages/pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'age'

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
<ipython-input-70-afe5349c6260> in <cell line: 1>()
----> 1 plt.scatter(df["age"], df["stroke"])
      2 
      3 # Add labels and title
      4 plt.xlabel("Age")
      5 plt.ylabel("Stroke")

/usr/local/lib/python3.10/dist-packages/pandas/core/frame.py in __getitem__(self, key)
   3805             if self.columns.nlevels > 1:
   3806                 return self._getitem_multilevel(key)
-> 3807             indexer = self.columns.get_loc(key)
   3808             if is_integer(indexer):
   3809                 indexer = [indexer]

/usr/local/lib/python3.10/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   3802                 return self._engine.get_loc(casted_key)
   3803             except KeyError as err:
-> 3804                 raise KeyError(key) from err
   3805             except TypeError:
   3806                 # If we have a listlike key, _check_indexing_error will raise

KeyError: 'age'
In [16]:
%%shell

jupyter nbconvert --to html '/content/drive/MyDrive/Colab Notebooks/Project Milestone 1.ipynb'
[NbConvertApp] Converting notebook /content/drive/MyDrive/Colab Notebooks/Project Milestone 1.ipynb to html
[NbConvertApp] Writing 598658 bytes to /content/drive/MyDrive/Colab Notebooks/Project Milestone 1.html
Out[16]: