The Data Journey
From raw data to actionable insights.

The data journey for this project involved several key stages. First, relevant data on KCSE candidature was collected from available sources like the KNEC website and Ministry of Education reports. This raw data was then cleaned and structured to ensure accuracy and consistency.

Once cleaned, the data was organized into tables to facilitate analysis. Finally, visualizations were created to reveal trends, patterns, and disparities within the data.

Sample SQL Scripts
Below are some sample SQL scripts that illustrate how the data was structured and queried to extract insights.
-- 1. Create Database
CREATE DATABASE IF NOT EXISTS kcse_analysis;
USE kcse_analysis;

-- 2. Create Table: Age Group Distribution
CREATE TABLE IF NOT EXISTS county_age_distribution (
    county_code INT,
    county_name VARCHAR(100),
    year INT,
    age_group VARCHAR(20),
    candidate_count INT
);

-- 3. Create Table: Gender Distribution
CREATE TABLE IF NOT EXISTS county_gender_distribution (
    county_code INT,
    county_name VARCHAR(100),
    year INT,
    gender VARCHAR(10),
    candidate_count INT
);

-- 4. Insert Sample Data: Age Group Distribution
INSERT INTO county_age_distribution (county_code, county_name, year, age_group, candidate_count) VALUES
(1, 'Taita Taveta', 2023, '16 Yrs & below', 70),
(1, 'Taita Taveta', 2023, '17 yrs', 1205),
(1, 'Taita Taveta', 2023, '18 yrs', 2579),
(1, 'Taita Taveta', 2023, '19 yrs', 1786),
(1, 'Taita Taveta', 2023, '20 yrs', 847),
(1, 'Taita Taveta', 2023, '21 yrs', 346),
(1, 'Taita Taveta', 2023, '22+ yrs', 213),
(1, 'Taita Taveta', 2023, 'Total', 7046),
(1, 'Taita Taveta', 2022, '16 Yrs & below', 74),
(1, 'Taita Taveta', 2022, '17 yrs', 1154),
(1, 'Taita Taveta', 2022, '18 yrs', 2459),
(1, 'Taita Taveta', 2022, '19 yrs', 1810),
(1, 'Taita Taveta', 2022, '20 yrs', 1059),
(1, 'Taita Taveta', 2022, '21 yrs', 385),
(1, 'Taita Taveta', 2022, '22+ yrs', 283),
(1, 'Taita Taveta', 2022, 'Total', 7224);

-- 5. Insert Sample Data: Gender Distribution
INSERT INTO county_gender_distribution (county_code, county_name, year, gender, candidate_count) VALUES
(1, 'Taita Taveta', 2023, 'Female', 3509),
(1, 'Taita Taveta', 2023, 'Male', 3537),
(1, 'Taita Taveta', 2022, 'Female', 3663),
(1, 'Taita Taveta', 2022, 'Male', 3561);

-- 6. Sample Query: Trend of Female Candidates Over Years
SELECT county_name, year, candidate_count
FROM county_gender_distribution
WHERE gender = 'Female'
ORDER BY year, candidate_count DESC
LIMIT 5;

-- 7. Sample Query: Most Common Age Group Per County (2023)
SELECT county_name, age_group, MAX(candidate_count) AS count_2023
FROM county_age_distribution
WHERE year = 2023
GROUP BY county_name
ORDER BY county_name;

-- 8. Sample Query: Compare Age Distribution Between 2022 and 2023
SELECT county_name, age_group,
    SUM(CASE WHEN year = 2022 THEN candidate_count ELSE 0 END) AS count_2022,
    SUM(CASE WHEN year = 2023 THEN candidate_count ELSE 0 END) AS count_2023
FROM county_age_distribution
GROUP BY county_name, age_group
ORDER BY county_name, age_group;