import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
# Lift pandas' display limits so printed frames are never truncated.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)


# Location of the brain-stroke CSV file.
file_path = "/brain_stroke.csv"

# Read the whole file into a DataFrame.
strokedata_df = pd.read_csv(filepath_or_buffer=file_path)


# Preview the first five rows (rendered as cell output in a notebook).
strokedata_df.head(n=5)


# Show the full column list before trimming it.
print(strokedata_df.columns)

# Drop the ever_married and work_type variables: they are not needed for what
# we will be exploring. errors="ignore" keeps this a no-op for a column that
# is already absent. The row index is then renumbered from zero.
strokedata_df = strokedata_df.drop(
    columns=["ever_married", "work_type"], errors="ignore"
).reset_index(drop=True)

Index(['gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')


# Tally the rows in each stroke class (1 indicates stroke, 0 indicates no stroke).
counts = strokedata_df.loc[:, "stroke"].value_counts()
print(counts)
# 248 people in the data set had a stroke.

0    4733
1     248
Name: stroke, dtype: int64


# Tally the rows in each hypertension class
# (1 indicates hypertension and 0 indicates no hypertension).
hyp_counts = strokedata_df.loc[:, "hypertension"].value_counts()
print(hyp_counts)
# 479 people in the data set had hypertension.

0    4502
1     479
Name: hypertension, dtype: int64


def replace_empty_with_NAN(entry):
    """Return ``np.nan`` for a placeholder cell, otherwise the cell unchanged.

    A cell counts as a placeholder when its text form contains an en dash
    ("–") or one of the spreadsheet-style zero markers ``="0"`` / ``="0.00"``.

    Args:
        entry: A single cell value of any type.

    Returns:
        ``np.nan`` for placeholder cells; otherwise ``entry`` itself, with its
        original type preserved.
    """
    # The text form is only used for detection. Returning it (as the earlier
    # version did) turned every number into a string, which made every column
    # object-typed and made a later astype('bool') treat "0" as True.
    text = str(entry)

    # Plain substring checks: the markers are literal text, so the "." in
    # "0.00" must not act as a regex wildcard.
    placeholders = ("–", '="0"', '="0.00"')
    if any(marker in text for marker in placeholders):
        return np.nan
    return entry


# Run the placeholder clean-up over every cell, one column at a time.
for col in strokedata_df.columns:
    cleaned_column = strokedata_df[col].apply(replace_empty_with_NAN)
    strokedata_df[col] = cleaned_column


# Inspect the dtype pandas currently holds for each column.
strokedata_df.dtypes
# The dtypes still need to be converted to proper types.

gender               object
age                  object
hypertension         object
heart_disease        object
Residence_type       object
avg_glucose_level    object
bmi                  object
smoking_status       object
stroke               object
dtype: object


# Binary indicator columns that should be stored as real booleans.
lst_of_col = ["heart_disease", "hypertension", "stroke"]

for col in lst_of_col:
    # These columns can arrive as text ("0"/"1"). Any non-empty string is
    # truthy, so a direct astype('bool') would mark every row True. Parse the
    # values as numbers first so that 0 -> False and 1 -> True.
    # NOTE(review): a missing value (NaN) would still become True here.
    strokedata_df[col] = pd.to_numeric(strokedata_df[col]).astype('bool')

gender               object
age                  object
hypertension           bool
heart_disease          bool
Residence_type       object
avg_glucose_level    object
bmi                  object
smoking_status       object
stroke                 bool
dtype: object


# Continuous measurements are stored as 64-bit floats.
lst_of_col = ["avg_glucose_level", "bmi", "age"]

for col in lst_of_col:
    strokedata_df[col] = strokedata_df[col].astype(np.float64)

# Display the resulting dtypes to confirm the conversion.
strokedata_df.dtypes

gender                object
age                  float64
hypertension            bool
heart_disease           bool
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                  bool
dtype: object


# Nested tally: gender -> stroke value -> number of rows.
stroke_counts = {}

# Walk the two columns in lockstep, one (gender, stroke) pair per row.
for gender, stroke in zip(strokedata_df["gender"], strokedata_df["stroke"]):
    # Fetch this gender's inner tally, creating it on first sight.
    per_gender = stroke_counts.setdefault(gender, {})
    # A stroke value not seen before starts from zero.
    per_gender[stroke] = per_gender.get(stroke, 0) + 1

# Report the tally: one heading per gender, then one indented line per value.
for gender, strokes in stroke_counts.items():
    print(f"{gender}:")
    for stroke, count in strokes.items():
        print(f"\t{stroke}: {count}")

Male:
	True: 2074
Female:
	True: 2907


import matplotlib.pyplot as plt

# Tally the rows per gender and stroke value: gender -> stroke value -> count.
stroke_counts = {}
for gender, stroke in zip(strokedata_df["gender"], strokedata_df["stroke"]):
    per_gender = stroke_counts.setdefault(gender, {})
    per_gender[stroke] = per_gender.get(stroke, 0) + 1

# Use ONE ordered list of stroke values for every gender. Each inner dict
# remembers values in the order that gender first encountered them, so taking
# positions and tick labels from the individual dicts could put different
# values at the same x position or label the ticks from the wrong gender.
stroke_names = sorted(
    {value for strokes in stroke_counts.values() for value in strokes},
    key=str,
)

# Plot the counts as one group of side-by-side bars per stroke value.
bar_width = 0.35
x_offset = 0

for gender, strokes in stroke_counts.items():
    # A value this gender never had is drawn as a zero-height bar.
    counts = [strokes.get(value, 0) for value in stroke_names]

    x = [position + x_offset for position in range(len(stroke_names))]
    plt.bar(x, counts, label=gender, width=bar_width)
    x_offset += bar_width

# Centre each tick under its group of bars, then add labels and title.
group_centre = (x_offset - bar_width) / 2
plt.xticks(
    [position + group_centre for position in range(len(stroke_names))],
    [str(value) for value in stroke_names],
)
plt.xlabel("Stroke Type")
plt.ylabel("Count")
plt.title("Stroke Counts by Gender")

# Add legend and show the plot.
plt.legend()
plt.tight_layout()
plt.show()

# Observation from the original run: females seem to have more strokes.
# NOTE(review): the data set also contains more female rows than male rows,
# so compare stroke rates rather than raw counts before concluding this.


# Work on a copy of the stroke data. The columns in this data set are
# lowercase ('age', 'gender', 'hypertension'), and the copy keeps the main
# DataFrame's row order untouched by the sort below.
df = strokedata_df.copy()

# Convert the 'age' column to numeric (unparseable values become NaN) and
# sort the rows by age.
df['age'] = pd.to_numeric(df['age'], errors='coerce')
df = df.sort_values(by='age')

# Add up the hypertension flags for every (age, gender) pair, then pivot the
# genders into columns so each age becomes one stacked bar.
grouped_data = df.groupby(['age', 'gender'])['hypertension'].sum().unstack()

# Take the tick labels from the grouped index so there is exactly one label
# per bar; groupby drops NaN ages, so a separate unique() list could be longer.
age_values = grouped_data.index

grouped_data.plot(kind='bar', stacked=True, alpha=0.5)

# Add the title and labels.
plt.title("Hypertension by Age and Gender")
plt.xlabel("Age")
plt.ylabel("Number of Individuals")
plt.legend(title="Gender")

# Set the x-ticks to the sorted age values, rotated for better readability.
plt.xticks(ticks=range(len(age_values)), labels=age_values, rotation=45)

plt.show()


# Plot each row's stroke value against its age. Read both columns from the
# stroke DataFrame itself: the recorded run of this cell indexed `df` instead
# and failed with KeyError: 'age'.
plt.scatter(strokedata_df["age"], strokedata_df["stroke"])

# Add labels and title
plt.xlabel("Age")
plt.ylabel("Stroke")
plt.title("Scatter Plot of Age vs. Stroke")

# Show the plot
plt.show()

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
/usr/local/lib/python3.10/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   3801             try:
-> 3802                 return self._engine.get_loc(casted_key)
   3803             except KeyError as err:

/usr/local/lib/python3.10/dist-packages/pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

/usr/local/lib/python3.10/dist-packages/pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'age'

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
<ipython-input-70-afe5349c6260> in <cell line: 1>()
----> 1 plt.scatter(df["age"], df["stroke"])
      2 
      3 # Add labels and title
      4 plt.xlabel("Age")
      5 plt.ylabel("Stroke")

/usr/local/lib/python3.10/dist-packages/pandas/core/frame.py in __getitem__(self, key)
   3805             if self.columns.nlevels > 1:
   3806                 return self._getitem_multilevel(key)
-> 3807             indexer = self.columns.get_loc(key)
   3808             if is_integer(indexer):
   3809                 indexer = [indexer]

/usr/local/lib/python3.10/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   3802                 return self._engine.get_loc(casted_key)
   3803             except KeyError as err:
-> 3804                 raise KeyError(key) from err
   3805             except TypeError:
   3806                 # If we have a listlike key, _check_indexing_error will raise

KeyError: 'age'


%%shell
# Convert the notebook at the given Drive path to HTML with nbconvert.

jupyter nbconvert --to html '/content/drive/MyDrive/Colab Notebooks/Project Milestone 1.ipynb'

[NbConvertApp] Converting notebook /content/drive/MyDrive/Colab Notebooks/Project Milestone 1.ipynb to html
[NbConvertApp] Writing 598658 bytes to /content/drive/MyDrive/Colab Notebooks/Project Milestone 1.html

	gender	age	hypertension	heart_disease	ever_married	work_type	Residence_type	avg_glucose_level	bmi	smoking_status	stroke
0	Male	67.0	0	1	Yes	Private	Urban	228.69	36.6	formerly smoked	1
1	Male	80.0	0	1	Yes	Private	Rural	105.92	32.5	never smoked	1
2	Female	49.0	0	0	Yes	Private	Urban	171.23	34.4	smokes	1
3	Female	79.0	1	0	Yes	Self-employed	Rural	174.12	24.0	never smoked	1
4	Male	81.0	0	0	Yes	Private	Urban	186.21	29.0	formerly smoked	1

Project Goals

Table 1: Stroke Data