import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
# Lift the display limits so frames are shown with every row and column.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

# Read the stroke dataset from its CSV file.
file_path = '/brain_stroke.csv'
strokedata_df = pd.read_csv(file_path)
I will be investigating data sets related to hypertension in the United States. I am interested in the correlation between patients having hypertension and subsequently suffering a stroke or heart attack. Sex-based differences and lifestyle differences will also be investigated. The website hosting the analysis can be found HERE (insert a link)
This table is from a CSV file downloaded from Kaggle (link to Kaggle dataset), but the data is originally from an electronic health record controlled by McKinsey & Company (link to EHR/Healthcare Hackathon Source). It was part of their healthcare hackathon in 2018, and the dataset is accessible through a free dataset repository. The dataset contains information about 29,072 patients having 12 common variables. The variables are age, gender, marital status, patient identifier, work type, residence type (urban or rural), heart disease condition (binary attribute), body mass index (BMI), smoking status, glucose level, and hypertension status (binary attribute). I will be answering questions such as, “Is there a sex-based difference regarding hypertension, heart disease, and stroke?”, “What age groups are affected most by hypertension and stroke prevalence?”, “How frequently does hypertension result in stroke?” and “Is there any correlation between various patient demographics and hypertension prevalence?”
# Preview the first five rows of the loaded data.
strokedata_df.head(n=5)
gender | age | hypertension | heart_disease | ever_married | work_type | Residence_type | avg_glucose_level | bmi | smoking_status | stroke | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | Male | 67.0 | 0 | 1 | Yes | Private | Urban | 228.69 | 36.6 | formerly smoked | 1 |
1 | Male | 80.0 | 0 | 1 | Yes | Private | Rural | 105.92 | 32.5 | never smoked | 1 |
2 | Female | 49.0 | 0 | 0 | Yes | Private | Urban | 171.23 | 34.4 | smokes | 1 |
3 | Female | 79.0 | 1 | 0 | Yes | Self-employed | Rural | 174.12 | 24.0 | never smoked | 1 |
4 | Male | 81.0 | 0 | 0 | Yes | Private | Urban | 186.21 | 29.0 | formerly smoked | 1 |
# Show the available columns before trimming.
print(strokedata_df.columns)

# Drop the ever_married and work_type variables, as they are not needed for
# what we will be exploring. errors="ignore" keeps the cell re-runnable once
# the columns are already gone.
strokedata_df = strokedata_df.drop(
    columns=["ever_married", "work_type"], errors="ignore"
).reset_index(drop=True)
Index(['gender', 'age', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'avg_glucose_level', 'bmi', 'smoking_status', 'stroke'], dtype='object')
# Tally the stroke indicator (1 indicates stroke, 0 indicates no stroke).
# In the recorded run, 248 people in the data set had a stroke.
counts = strokedata_df.loc[:, 'stroke'].value_counts()
print(counts)
0 4733 1 248 Name: stroke, dtype: int64
# Tally the hypertension indicator (1 indicates hypertension, 0 indicates
# no hypertension). In the recorded run, 479 people had hypertension.
hyp_counts = strokedata_df.loc[:, 'hypertension'].value_counts()
print(hyp_counts)
0 4502 1 479 Name: hypertension, dtype: int64
def replace_empty_with_NAN(entry):
    """Return ``np.nan`` for placeholder entries, otherwise the entry itself.

    An entry is treated as a placeholder when its string form matches one
    of the patterns ``–`` (en dash), ``="0"`` or ``="0.00"``.

    Parameters
    ----------
    entry : object
        A single cell value. It is converted with ``str`` only to run the
        pattern match.

    Returns
    -------
    object
        ``np.nan`` when a placeholder pattern is found; otherwise ``entry``
        with its original type. Returning the original value (rather than
        its string form) keeps numeric columns numeric, so later casts such
        as ``astype('bool')`` see 0/1 instead of the always-truthy strings
        "0"/"1".
    """
    text = str(entry)
    # NOTE: patterns are regular expressions, so the "." in '="0.00"'
    # matches any character; kept as-is to preserve the matching rules.
    placeholder_patterns = ("–", '="0"', '="0.00"')
    if any(re.search(pattern, text) is not None for pattern in placeholder_patterns):
        return np.nan
    return entry
# Run the placeholder clean-up over every cell, one column at a time.
for col in strokedata_df.columns:
    cleaned_column = strokedata_df[col].map(replace_empty_with_NAN)
    strokedata_df[col] = cleaned_column

# Inspect the resulting dtypes; they still need to be formatted.
strokedata_df.dtypes
gender object age object hypertension object heart_disease object Residence_type object avg_glucose_level object bmi object smoking_status object stroke object dtype: object
# Cast the binary indicator columns to bool.
# The clean-up step above can leave these columns as the strings "0"/"1".
# Any non-empty string is truthy, so casting a string column straight to
# bool would mark every row True. Parse the values as numbers first.
# NOTE(review): a missing value (NaN) still becomes True under astype(bool);
# confirm these columns contain no missing entries.
lst_of_col = ["heart_disease", "hypertension", "stroke"]
for col in lst_of_col:
    strokedata_df[col] = pd.to_numeric(strokedata_df[col]).astype('bool')
gender object age object hypertension bool heart_disease bool Residence_type object avg_glucose_level object bmi object smoking_status object stroke bool dtype: object
# Convert the continuous measurements to 64-bit floats for proper formatting.
lst_of_col = ["avg_glucose_level", "bmi", "age"]
for col in lst_of_col:
    strokedata_df[col] = strokedata_df[col].astype(np.float64)

# Confirm the updated dtypes.
strokedata_df.dtypes
gender object age float64 hypertension bool heart_disease bool Residence_type object avg_glucose_level float64 bmi float64 smoking_status object stroke bool dtype: object
# Nested tally: stroke_counts[gender][stroke] -> number of rows.
stroke_counts = {}

# Walk the gender and stroke columns in lockstep, one pair per row.
for gender, stroke in zip(strokedata_df["gender"], strokedata_df["stroke"]):
    # setdefault creates the per-gender dictionary on first sight.
    per_gender = stroke_counts.setdefault(gender, {})
    # Start a missing stroke value at zero, then add this row.
    per_gender[stroke] = per_gender.get(stroke, 0) + 1

# Report each gender followed by its indented stroke counts.
for gender, strokes in stroke_counts.items():
    print(f"{gender}:")
    for stroke, count in strokes.items():
        print(f"\t{stroke}: {count}")
Male: True: 2074 Female: True: 2907
import matplotlib.pyplot as plt
# Grouped bar chart of stroke counts, one bar colour per gender.

# Nested tally: stroke_counts[gender][stroke] -> number of rows.
stroke_counts = {}
for gender, stroke in zip(strokedata_df["gender"], strokedata_df["stroke"]):
    per_gender = stroke_counts.setdefault(gender, {})
    per_gender[stroke] = per_gender.get(stroke, 0) + 1

# Use one shared, sorted list of stroke values for all genders. Each gender's
# dictionary is ordered by first occurrence in the data, so positioning bars
# by per-gender order could put different stroke values under the same tick.
stroke_names = sorted(
    {name for strokes in stroke_counts.values() for name in strokes},
    key=str,
)

bar_width = 0.35
base_positions = range(len(stroke_names))
for group_index, (gender, strokes) in enumerate(stroke_counts.items()):
    # A stroke value never seen for this gender is drawn as a zero-height bar.
    heights = [strokes.get(name, 0) for name in stroke_names]
    positions = [i + group_index * bar_width for i in base_positions]
    plt.bar(positions, heights, label=gender, width=bar_width)

# Centre each tick under its group of bars.
group_span = bar_width * max(len(stroke_counts) - 1, 0)
plt.xticks(
    [i + group_span / 2 for i in base_positions],
    [str(name) for name in stroke_names],
)
plt.xlabel("Stroke Type")
plt.ylabel("Count")
plt.title("Stroke Counts by Gender")

# Add legend and show the plot.
plt.legend()
plt.tight_layout()
plt.show()
# Original observation: females seem to have more strokes.
# NOTE(review): the gender groups differ in size, so compare stroke rates
# rather than raw counts before drawing this conclusion.
# Stacked bar chart of hypertension cases per age, split by gender.

# Work on a copy of the cleaned stroke data. The dataset's columns are
# lower-case ("age", "gender", "hypertension"), so those names are used here.
df = strokedata_df.copy()
df["age"] = pd.to_numeric(df["age"], errors="coerce")
df = df.sort_values(by="age")

# Sum the hypertension indicator for every (age, gender) pair and spread
# gender across the columns so each gender becomes one stacked segment.
grouped_data = df.groupby(["age", "gender"])["hypertension"].sum().unstack()
grouped_data.plot(kind="bar", stacked=True, alpha=0.5)

# Add the title and labels.
plt.title("Hypertension by Age and Gender")
plt.xlabel("Age")
plt.ylabel("Number of Individuals")
plt.legend(title="Gender")

# Label the ticks from the plotted index itself, so the number of labels
# always equals the number of bars (groupby leaves out missing ages).
age_values = grouped_data.index
plt.xticks(ticks=range(len(age_values)), labels=age_values, rotation=45)  # rotated for readability
plt.show()
# Scatter plot of age against the stroke indicator.
# Read from strokedata_df, which holds the lower-case "age" and "stroke"
# columns; the recorded run failed with KeyError: 'age' on a different frame.
# The indicator is cast to int so the points are drawn at 0 and 1.
plt.scatter(strokedata_df["age"], strokedata_df["stroke"].astype(int))

# Add labels and title
plt.xlabel("Age")
plt.ylabel("Stroke")
plt.title("Scatter Plot of Age vs. Stroke")

# Show the plot
plt.show()
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) /usr/local/lib/python3.10/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance) 3801 try: -> 3802 return self._engine.get_loc(casted_key) 3803 except KeyError as err: /usr/local/lib/python3.10/dist-packages/pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc() /usr/local/lib/python3.10/dist-packages/pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc() pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item() pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item() KeyError: 'age' The above exception was the direct cause of the following exception: KeyError Traceback (most recent call last) <ipython-input-70-afe5349c6260> in <cell line: 1>() ----> 1 plt.scatter(df["age"], df["stroke"]) 2 3 # Add labels and title 4 plt.xlabel("Age") 5 plt.ylabel("Stroke") /usr/local/lib/python3.10/dist-packages/pandas/core/frame.py in __getitem__(self, key) 3805 if self.columns.nlevels > 1: 3806 return self._getitem_multilevel(key) -> 3807 indexer = self.columns.get_loc(key) 3808 if is_integer(indexer): 3809 indexer = [indexer] /usr/local/lib/python3.10/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance) 3802 return self._engine.get_loc(casted_key) 3803 except KeyError as err: -> 3804 raise KeyError(key) from err 3805 except TypeError: 3806 # If we have a listlike key, _check_indexing_error will raise KeyError: 'age'
%%shell
# Convert this notebook to an HTML file with nbconvert.
jupyter nbconvert --to html '/content/drive/MyDrive/Colab Notebooks/Project Milestone 1.ipynb'
[NbConvertApp] Converting notebook /content/drive/MyDrive/Colab Notebooks/Project Milestone 1.ipynb to html [NbConvertApp] Writing 598658 bytes to /content/drive/MyDrive/Colab Notebooks/Project Milestone 1.html