Exploratory Data Analysis (EDA)
EDA is crucial for understanding your data. The `describe()` method gives you a quick statistical summary.
Mastering this concept will significantly boost your Python data science skills!
💻 Code Example:
# Exploratory data analysis walkthrough on a synthetic learner dataset.
# The script builds a reproducible DataFrame and prints, in order: shape and
# dtypes, a statistical summary, null counts, category distributions,
# per-course aggregates, a correlation matrix, and an IQR-based outlier count.
import numpy as np
import pandas as pd

# Number of synthetic learners; every column is generated with this length.
N_LEARNERS = 100

# Seed the global NumPy RNG so each run produces the same dataset.
np.random.seed(42)

# Create pynfinity learner dataset.
# NOTE: the random columns are drawn in this exact order; reordering them
# would change the generated values for a given seed.
df = pd.DataFrame({
    "username": [f"user_{i}" for i in range(N_LEARNERS)],
    "python_score": np.random.randint(40, 100, N_LEARNERS),
    "ai_score": np.random.randint(30, 100, N_LEARNERS),
    "study_hours": np.round(np.random.uniform(1, 12, N_LEARNERS), 1),
    "premium": np.random.choice([True, False], N_LEARNERS, p=[0.3, 0.7]),
    "course": np.random.choice(["Python", "AI", "DevOps"], N_LEARNERS),
})

# 1. Shape & data types
print("Shape:", df.shape)
print("\nDtypes:\n", df.dtypes)

# 2. Statistical summary
print("\nSummary:\n", df.describe())

# 3. Missing values check
print("\nNull counts:\n", df.isnull().sum())

# 4. Value counts & distribution
print("\nCourse distribution:\n", df["course"].value_counts())
# The mean of a boolean column is the fraction of True values. Built-in
# round() is used so this works whether mean() returns a NumPy scalar or a
# plain Python float.
print("\nPremium ratio:", round(df["premium"].mean(), 2))

# 5. Group by analysis
group_stats = df.groupby("course")[["python_score", "ai_score", "study_hours"]].agg(
    ["mean", "std", "max"]
).round(2)
print("\nCourse-level stats:\n", group_stats)

# 6. Correlation
print("\nCorrelation matrix:")
print(df[["python_score", "ai_score", "study_hours"]].corr().round(3))

# 7. Outlier detection (IQR method): flag rows whose python_score lies more
# than 1.5 * IQR below the first quartile or above the third quartile.
Q1 = df["python_score"].quantile(0.25)
Q3 = df["python_score"].quantile(0.75)
IQR = Q3 - Q1
outliers = df[
    (df["python_score"] < Q1 - 1.5 * IQR)
    | (df["python_score"] > Q3 + 1.5 * IQR)
]
print(f"\nOutliers found: {len(outliers)}")
Keep exploring and happy coding! 💻