Project Analisis Data COVID-19 untuk Pemula dengan Python

Project Analisis Data COVID-19 untuk Pemula dengan Python

1. Persiapan Project Analisis Data

1.1 Install Library yang Dibutuhkan

pip install pandas matplotlib seaborn numpy

1.2 Import Library

python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

2. Pengumpulan Data

Kita akan menggunakan data dari Johns Hopkins University Center for Systems Science and Engineering (JHU CSSE):

python
# Source CSV files (confirmed, deaths, recovered) of the JHU CSSE
# COVID-19 global time series.
confirmed_url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"
deaths_url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv"
recovered_url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv"

# Download each file into its own DataFrame, in the same order as the
# URLs above (confirmed, then deaths, then recovered).
confirmed, deaths, recovered = map(
    pd.read_csv,
    (confirmed_url, deaths_url, recovered_url),
)

3. Pemahaman Data (Data Understanding)

3.1 Melihat Struktur Data

python
# Show the first rows of every dataset under its own heading.
previews = (
    ("Confirmed Cases Data:", confirmed),
    ("\nDeaths Data:", deaths),
    ("\nRecovered Data:", recovered),
)
for heading, frame in previews:
    print(heading)
    print(frame.head())

# Then report the (rows, columns) shape of every dataset.
shape_labels = (
    ("\nConfirmed Data Shape:", confirmed),
    ("Deaths Data Shape:", deaths),
    ("Recovered Data Shape:", recovered),
)
for label, frame in shape_labels:
    print(label, frame.shape)

3.2 Cek Missing Values

python
# Print the number of missing entries per column for every dataset.
null_reports = (
    ("Missing Values in Confirmed Data:", confirmed),
    ("\nMissing Values in Deaths Data:", deaths),
    ("\nMissing Values in Recovered Data:", recovered),
)
for heading, frame in null_reports:
    print(heading)
    print(frame.isna().sum())

4. Data Cleaning

4.1 Mengisi Missing Values

python
# Apply the same imputation to all three datasets so they stay consistent
# with each other (previously only `confirmed` had its coordinates filled).
for frame in (confirmed, deaths, recovered):
    # Rows without a province/state get the placeholder label 'Unknown'.
    frame['Province/State'] = frame['Province/State'].fillna('Unknown')

    # Missing coordinates are replaced by the column mean of that frame.
    # NOTE: the mean is only a placeholder so later steps do not see NaN;
    # it is not a real geographic position for the affected rows.
    for coord in ('Lat', 'Long'):
        frame[coord] = frame[coord].fillna(frame[coord].mean())

4.2 Transformasi Data

python
# Convert the data from wide format to long format.
def melt_data(df, value_name):
    """Return a long-format copy of a wide time-series DataFrame.

    The columns 'Province/State', 'Country/Region', 'Lat' and 'Long' are
    kept as identifiers; every other column is unpivoted into a row whose
    original column label lands in 'Date' (parsed with ``pd.to_datetime``)
    and whose cell value lands in the column named by ``value_name``.

    Args:
        df: Wide DataFrame containing the four identifier columns plus one
            column per date.
        value_name: Name for the column holding the unpivoted values.

    Returns:
        A new DataFrame in long format; ``df`` itself is not modified.
    """
    key_columns = ['Province/State', 'Country/Region', 'Lat', 'Long']
    long_df = pd.melt(
        df,
        id_vars=key_columns,
        var_name='Date',
        value_name=value_name,
    )
    # The former column labels are strings; turn them into timestamps.
    return long_df.assign(Date=lambda frame: pd.to_datetime(frame['Date']))

# Reshape each dataset to long format, naming the value column after it.
confirmed_long, deaths_long, recovered_long = (
    melt_data(frame, label)
    for frame, label in (
        (confirmed, 'Confirmed'),
        (deaths, 'Deaths'),
        (recovered, 'Recovered'),
    )
)

5. Analisis Data

5.1 Kasus Global

python
# Worldwide totals per day: sum every row that shares the same date.
global_confirmed = confirmed_long.groupby('Date')['Confirmed'].sum()
global_deaths = deaths_long.groupby('Date')['Deaths'].sum()
global_recovered = recovered_long.groupby('Date')['Recovered'].sum()

# Draw the three global curves on one set of axes.
fig, ax = plt.subplots(figsize=(14, 7))
curves = (
    (global_confirmed, 'Confirmed', 'blue'),
    (global_deaths, 'Deaths', 'red'),
    (global_recovered, 'Recovered', 'green'),
)
for series, label, colour in curves:
    ax.plot(series.index, series.values, label=label, color=colour)
ax.set_title('Global COVID-19 Cases Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Number of Cases')
ax.legend()
ax.grid()
plt.show()

5.2 Analisis Per Negara

python
# Pick the 10 countries with the highest confirmed case counts.
# The long table has one row per province/state per date, so the rows of a
# country must first be summed per date; taking max() directly over the raw
# rows would only return the largest single province of each country.
country_daily_totals = confirmed_long.groupby(
    ['Country/Region', 'Date']
)['Confirmed'].sum()
top_countries = (
    country_daily_totals.groupby(level='Country/Region').max().nlargest(10)
)

plt.figure(figsize=(12, 6))
top_countries.plot(kind='bar', color='orange')
plt.title('Top 10 Countries by Confirmed Cases')
plt.xlabel('Country')
plt.ylabel('Confirmed Cases')
plt.xticks(rotation=45)
plt.show()

5.3 Analisis Kematian vs Sembuh

python
# Compute death and recovery rates for the most recent date.
latest_date = confirmed_long['Date'].max()
merge_keys = ['Province/State', 'Country/Region', 'Date']

# Snapshot of every dataset at the latest date. Only the key columns and the
# value column are taken from deaths/recovered so the merge does not create
# duplicated 'Lat_x'/'Lat_y' columns; 'Lat'/'Long' come from the confirmed
# frame.
confirmed_latest = confirmed_long[confirmed_long['Date'] == latest_date]
deaths_latest = deaths_long.loc[
    deaths_long['Date'] == latest_date, merge_keys + ['Deaths']
]
recovered_latest = recovered_long.loc[
    recovered_long['Date'] == latest_date, merge_keys + ['Recovered']
]

# Row-level (province/state) table. The recovered data is left-joined so
# that rows missing from the recovered file are kept (Recovered = NaN)
# instead of being silently dropped.
latest_data = confirmed_latest.merge(deaths_latest, on=merge_keys).merge(
    recovered_latest, on=merge_keys, how='left'
)

# Rows with zero confirmed cases get NaN rates instead of inf.
row_cases = latest_data['Confirmed'].where(latest_data['Confirmed'] > 0)
latest_data['Death Rate'] = (latest_data['Deaths'] / row_cases) * 100
latest_data['Recovery Rate'] = (latest_data['Recovered'] / row_cases) * 100

# Country-level totals: the chart below is about countries, so province
# rows are summed per country before the rates are computed.
country_latest = latest_data.groupby('Country/Region', as_index=False)[
    ['Confirmed', 'Deaths']
].sum()
# Recovered is aggregated straight from the recovered snapshot because its
# province breakdown does not have to match the confirmed file.
recovered_by_country = recovered_latest.groupby('Country/Region')['Recovered'].sum()
country_latest['Recovered'] = country_latest['Country/Region'].map(recovered_by_country)

country_cases = country_latest['Confirmed'].where(country_latest['Confirmed'] > 0)
country_latest['Death Rate'] = (country_latest['Deaths'] / country_cases) * 100
country_latest['Recovery Rate'] = (country_latest['Recovered'] / country_cases) * 100

# Keep only countries with more than 1000 confirmed cases.
significant_countries = country_latest[country_latest['Confirmed'] > 1000]

# Plot the 10 countries with the highest death rate.
top_death_rates = significant_countries.nlargest(10, 'Death Rate')

plt.figure(figsize=(12, 6))
plt.bar(top_death_rates['Country/Region'], top_death_rates['Death Rate'], color='red')
plt.title('Top 10 Countries by Death Rate (Confirmed Cases > 1000)')
plt.xlabel('Country')
plt.ylabel('Death Rate (%)')
plt.xticks(rotation=45)
plt.show()

6. Visualisasi Lanjutan

6.1 Heatmap Perkembangan Kasus

python
# Country x date matrix of confirmed cases (province rows summed per country).
heatmap_data = pd.pivot_table(
    confirmed_long,
    index='Country/Region',
    columns='Date',
    values='Confirmed',
    aggfunc='sum',
)

# Restrict the matrix to the 20 countries with the highest peak value.
top_20 = heatmap_data.max(axis=1).nlargest(20).index
heatmap_data = heatmap_data.loc[top_20]

# Draw the heatmap on a log scale (log1p) so that countries with smaller
# counts remain visible next to the largest ones.
fig, ax = plt.subplots(figsize=(16, 10))
log_cases = np.log1p(heatmap_data)
sns.heatmap(log_cases, cmap='YlOrRd', linewidths=0.1, ax=ax)
ax.set_title('COVID-19 Confirmed Cases Heatmap (Top 20 Countries)')
ax.set_xlabel('Date')
ax.set_ylabel('Country')
plt.show()

6.2 Analisis Harian Baru

python
# Daily new counts: day-over-day difference of the cumulative series.
# The first day has no predecessor, so its NaN difference is set to 0.
global_confirmed_daily = global_confirmed.diff().fillna(0)
global_deaths_daily = global_deaths.diff().fillna(0)
global_recovered_daily = global_recovered.diff().fillna(0)

# Plot the raw daily new cases together with their 7-day moving average.
weekly_average = global_confirmed_daily.rolling(7).mean()

fig, ax = plt.subplots(figsize=(14, 7))
ax.plot(
    global_confirmed_daily.index,
    global_confirmed_daily.values,
    label='Daily New Cases',
    color='blue',
    alpha=0.3,
)
ax.plot(
    global_confirmed_daily.index,
    weekly_average,
    label='7-Day Moving Average',
    color='blue',
    linewidth=2,
)
ax.set_title('Global Daily New COVID-19 Cases')
ax.set_xlabel('Date')
ax.set_ylabel('New Cases')
ax.legend()
ax.grid()
plt.show()

7. Analisis Spasial (Opsional)

Jika Anda ingin membuat peta, install library berikut:

bash
pip install folium
python
import folium

# Create the world map.
world_map = folium.Map(location=[20, 0], zoom_start=2)

# Select the rows to draw up front: only entries with more than 1000
# confirmed cases, and only those with usable coordinates. folium rejects
# NaN locations, so rows without Lat/Long are skipped instead of aborting
# the whole map.
map_rows = latest_data[latest_data['Confirmed'] > 1000].dropna(
    subset=['Lat', 'Long']
)

# Add one circle marker per selected row of `latest_data`.
for _, row in map_rows.iterrows():
    folium.CircleMarker(
        location=[row['Lat'], row['Long']],
        radius=np.log(row['Confirmed'])*2,  # marker size grows with log(cases)
        popup=f"{row['Country/Region']}<br>Confirmed: {row['Confirmed']:,}<br>Deaths: {row['Deaths']:,}",
        color='red',
        fill=True,
        fill_color='red'
    ).add_to(world_map)

# Save the map as a standalone HTML file.
world_map.save('covid19_map.html')

8. Kesimpulan dan Pelaporan

Buat laporan sederhana dari analisis:

python
# Summary report built from the cumulative global series.
print("=== COVID-19 ANALYSIS REPORT ===")
print(f"Last Update: {latest_date.strftime('%Y-%m-%d')}")
print(f"Total Global Cases: {global_confirmed.max():,}")
print(f"Total Global Deaths: {global_deaths.max():,}")
print(f"Total Global Recovered: {global_recovered.max():,}")
print(f"\nGlobal Death Rate: {(global_deaths.max()/global_confirmed.max())*100:.2f}%")
print(f"Global Recovery Rate: {(global_recovered.max()/global_confirmed.max())*100:.2f}%")

# Country ranking: sum the province/state rows per country and date before
# taking the maximum, otherwise a country reported by province would be
# ranked by its largest single province instead of its national total.
country_totals = (
    confirmed_long.groupby(['Country/Region', 'Date'])['Confirmed']
    .sum()
    .groupby(level='Country/Region')
    .max()
)
print("\nTop 5 Countries by Confirmed Cases:")
print(country_totals.nlargest(5))

Comments

No comments yet. Why don’t you start the discussion?

Leave a Reply

Your email address will not be published. Required fields are marked *