1. Persiapan Project Analisis Data
1.1 Install Library yang Dibutuhkan
pip install pandas matplotlib seaborn numpy
1.2 Import Library
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
2. Pengumpulan Data
Kita akan menggunakan data deret waktu global dari Johns Hopkins University Center for Systems Science and Engineering (JHU CSSE):
# COVID-19 dataset URLs (global time-series CSV files: confirmed, deaths, recovered).
confirmed_url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"
deaths_url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv"
recovered_url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv"

# Load each CSV into its own DataFrame, in the same order as the URLs above.
confirmed, deaths, recovered = (
    pd.read_csv(source_url)
    for source_url in (confirmed_url, deaths_url, recovered_url)
)
3. Pemahaman Data (Data Understanding)
3.1 Melihat Struktur Data
# Preview the first rows of every table. The headings after the first one
# carry their own leading newline, which leaves a blank line between tables.
for heading, frame in (
    ("Confirmed Cases Data:", confirmed),
    ("\nDeaths Data:", deaths),
    ("\nRecovered Data:", recovered),
):
    print(heading)
    print(frame.head())

# Report the (rows, columns) shape of every table.
for label, frame in (
    ("\nConfirmed Data Shape:", confirmed),
    ("Deaths Data Shape:", deaths),
    ("Recovered Data Shape:", recovered),
):
    print(label, frame.shape)
3.2 Cek Missing Values
# Print the number of null entries per column for each of the three tables.
for heading, frame in (
    ("Missing Values in Confirmed Data:", confirmed),
    ("\nMissing Values in Deaths Data:", deaths),
    ("\nMissing Values in Recovered Data:", recovered),
):
    null_counts = frame.isnull().sum()
    print(heading)
    print(null_counts)
4. Data Cleaning
4.1 Mengisi Missing Values
# Clean all three tables the same way. Previously only `confirmed` had its
# coordinates filled, so `deaths` and `recovered` kept NaN Lat/Long values
# that could leak into later joins and the map.
for frame in (confirmed, deaths, recovered):
    # Rows without a province/state get the placeholder label 'Unknown'.
    frame['Province/State'] = frame['Province/State'].fillna('Unknown')

    # Fill missing coordinates with that table's own column mean.
    # NOTE(review): the mean is only a placeholder so the columns contain no
    # NaN; it is not a real location for the affected rows.
    for coord in ('Lat', 'Long'):
        frame[coord] = frame[coord].fillna(frame[coord].mean())
4.2 Transformasi Data
def melt_data(df, value_name):
    """Reshape a table from wide format (one column per date) to long format.

    Args:
        df: DataFrame containing the identifier columns 'Province/State',
            'Country/Region', 'Lat' and 'Long'; every other column is
            melted into a (Date, value) pair.
        value_name: Name of the column that receives the melted values.

    Returns:
        A new DataFrame with the identifier columns, a 'Date' column
        converted with ``pd.to_datetime``, and the ``value_name`` column.
    """
    key_columns = ['Province/State', 'Country/Region', 'Lat', 'Long']
    long_df = pd.melt(
        df,
        id_vars=key_columns,
        var_name='Date',
        value_name=value_name,
    )
    # The former column headers are strings; turn them into real timestamps.
    long_df['Date'] = pd.to_datetime(long_df['Date'])
    return long_df


# Convert each of the three tables from wide to long format.
confirmed_long, deaths_long, recovered_long = (
    melt_data(table, column)
    for table, column in (
        (confirmed, 'Confirmed'),
        (deaths, 'Deaths'),
        (recovered, 'Recovered'),
    )
)
5. Analisis Data
5.1 Kasus Global
# Global total per day: add up every row that shares the same date.
global_confirmed = confirmed_long.groupby('Date')['Confirmed'].sum()
global_deaths = deaths_long.groupby('Date')['Deaths'].sum()
global_recovered = recovered_long.groupby('Date')['Recovered'].sum()

# Plot the three global curves on one set of axes.
fig, ax = plt.subplots(figsize=(14, 7))
for series, label, colour in (
    (global_confirmed, 'Confirmed', 'blue'),
    (global_deaths, 'Deaths', 'red'),
    (global_recovered, 'Recovered', 'green'),
):
    ax.plot(series.index, series.values, label=label, color=colour)
ax.set_title('Global COVID-19 Cases Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Number of Cases')
ax.legend()
ax.grid()
plt.show()
5.2 Analisis Per Negara
# Pick the 10 countries with the highest confirmed case counts.
# The long table can hold several rows per country and date (one per
# province/state), so sum them into a national total first. Taking max()
# straight over the rows would rank a country by its largest single province.
country_daily_totals = (
    confirmed_long.groupby(['Country/Region', 'Date'])['Confirmed'].sum()
)
top_countries = (
    country_daily_totals.groupby(level='Country/Region').max().nlargest(10)
)

plt.figure(figsize=(12, 6))
top_countries.plot(kind='bar', color='orange')
plt.title('Top 10 Countries by Confirmed Cases')
plt.xlabel('Country')
plt.ylabel('Confirmed Cases')
plt.xticks(rotation=45)
plt.show()
5.3 Analisis Kematian vs Sembuh
# Compute death and recovery rates as of the most recent date in the data.
latest_date = confirmed_long['Date'].max()
merge_keys = ['Province/State', 'Country/Region', 'Date']

# Keep Lat/Long only from the confirmed table. Merging three frames that all
# carry Lat/Long on just these keys would produce Lat_x/Lat_y/Lat duplicates,
# and the unsuffixed 'Lat'/'Long' would then come from the recovered table.
latest_confirmed = confirmed_long[confirmed_long['Date'] == latest_date]
latest_deaths = deaths_long.loc[
    deaths_long['Date'] == latest_date, merge_keys + ['Deaths']
]
latest_recovered = recovered_long.loc[
    recovered_long['Date'] == latest_date, merge_keys + ['Recovered']
]

# NOTE(review): these are inner joins, so a row whose keys are missing from
# any of the three tables is dropped - confirm the tables share the same
# Province/State granularity.
latest_data = latest_confirmed.merge(latest_deaths, on=merge_keys).merge(
    latest_recovered, on=merge_keys
)

# Mask zero confirmed counts so the rates become NaN instead of inf.
row_cases = latest_data['Confirmed'].where(latest_data['Confirmed'] > 0)
latest_data['Death Rate'] = (latest_data['Deaths'] / row_cases) * 100
latest_data['Recovery Rate'] = (latest_data['Recovered'] / row_cases) * 100

# The chart compares countries, so collapse province/state rows into one
# national row before computing the rates it shows.
country_latest = latest_data.groupby('Country/Region', as_index=False)[
    ['Confirmed', 'Deaths', 'Recovered']
].sum()
country_cases = country_latest['Confirmed'].where(country_latest['Confirmed'] > 0)
country_latest['Death Rate'] = (country_latest['Deaths'] / country_cases) * 100
country_latest['Recovery Rate'] = (
    country_latest['Recovered'] / country_cases
) * 100

# Keep only countries with more than 1000 cases.
significant_countries = country_latest[country_latest['Confirmed'] > 1000]

# Plot the 10 countries with the highest death rate.
top_death_rates = significant_countries.nlargest(10, 'Death Rate')
plt.figure(figsize=(12, 6))
plt.bar(top_death_rates['Country/Region'], top_death_rates['Death Rate'], color='red')
plt.title('Top 10 Countries by Death Rate (Confirmed Cases > 1000)')
plt.xlabel('Country')
plt.ylabel('Death Rate (%)')
plt.xticks(rotation=45)
plt.show()
6. Visualisasi Lanjutan
6.1 Heatmap Perkembangan Kasus
# Build a country x date table of confirmed cases for the heatmap;
# province/state rows of the same country are summed together.
heatmap_data = confirmed_long.pivot_table(
    index='Country/Region',
    columns='Date',
    values='Confirmed',
    aggfunc='sum'
)

# Keep the 20 countries with the highest case counts.
top_20 = heatmap_data.max(axis=1).nlargest(20).index
heatmap_data = heatmap_data.loc[top_20]

# Plot a log-scaled copy so `heatmap_data` itself keeps the raw counts and
# its datetime columns. The copy gets short date strings as column labels;
# otherwise the x-axis ticks show full nanosecond timestamps.
heatmap_plot_data = np.log1p(heatmap_data)
heatmap_plot_data.columns = heatmap_plot_data.columns.strftime('%Y-%m-%d')

plt.figure(figsize=(16, 10))
sns.heatmap(
    heatmap_plot_data,  # log1p compresses the range so smaller countries stay visible
    cmap='YlOrRd',
    linewidths=0.1,
    # The colour bar shows transformed values, so label it accordingly.
    cbar_kws={'label': 'log(1 + confirmed cases)'}
)
plt.title('COVID-19 Confirmed Cases Heatmap (Top 20 Countries)')
plt.xlabel('Date')
plt.ylabel('Country')
plt.show()
6.2 Analisis Kasus Baru Harian
# Daily new counts are the day-over-day difference of the cumulative totals;
# the first day has no predecessor, so its NaN is replaced with 0.
global_confirmed_daily = global_confirmed.diff().fillna(0)
global_deaths_daily = global_deaths.diff().fillna(0)
global_recovered_daily = global_recovered.diff().fillna(0)

# Smooth the daily confirmed series with a 7-day moving average.
weekly_average = global_confirmed_daily.rolling(7).mean()

# Plot the raw daily values (faded) under the smoothed trend line.
fig, ax = plt.subplots(figsize=(14, 7))
ax.plot(
    global_confirmed_daily.index,
    global_confirmed_daily.values,
    label='Daily New Cases',
    color='blue',
    alpha=0.3,
)
ax.plot(
    global_confirmed_daily.index,
    weekly_average,
    label='7-Day Moving Average',
    color='blue',
    linewidth=2,
)
ax.set_title('Global Daily New COVID-19 Cases')
ax.set_xlabel('Date')
ax.set_ylabel('New Cases')
ax.legend()
ax.grid()
plt.show()
7. Analisis Spasial (Opsional)
Jika Anda ingin membuat peta interaktif, instal library berikut:
pip install folium
import folium

# Create the world map, centred near the equator and zoomed out.
world_map = folium.Map(location=[20, 0], zoom_start=2)

# Only show rows with more than 1000 cases, and drop rows without usable
# coordinates: folium rejects NaN locations, so one such row would abort
# the whole loop.
map_rows = latest_data[latest_data['Confirmed'] > 1000].dropna(
    subset=['Lat', 'Long']
)

# Add one circle marker per remaining row.
for _, row in map_rows.iterrows():
    folium.CircleMarker(
        location=[row['Lat'], row['Long']],
        radius=np.log(row['Confirmed']) * 2,  # marker size grows with log(cases)
        popup=f"{row['Country/Region']}<br>Confirmed: {row['Confirmed']:,}<br>Deaths: {row['Deaths']:,}",
        color='red',
        fill=True,
        fill_color='red'
    ).add_to(world_map)

# Save the map as a standalone HTML file.
world_map.save('covid19_map.html')
8. Kesimpulan dan Pelaporan
Buat laporan sederhana dari hasil analisis:
# Headline figures: peak of each cumulative global series.
total_confirmed = global_confirmed.max()
total_deaths = global_deaths.max()
total_recovered = global_recovered.max()

print("=== COVID-19 ANALYSIS REPORT ===")
print(f"Last Update: {latest_date.strftime('%Y-%m-%d')}")
print(f"Total Global Cases: {total_confirmed:,}")
print(f"Total Global Deaths: {total_deaths:,}")
print(f"Total Global Recovered: {total_recovered:,}")
print(f"\nGlobal Death Rate: {(total_deaths/total_confirmed)*100:.2f}%")
print(f"Global Recovery Rate: {(total_recovered/total_confirmed)*100:.2f}%")

# Rank countries by national totals. Province/state rows are summed per
# country and date first; taking max() over the raw rows would rank a country
# by its largest single province.
country_peak_cases = (
    confirmed_long.groupby(['Country/Region', 'Date'])['Confirmed']
    .sum()
    .groupby(level='Country/Region')
    .max()
)
print("\nTop 5 Countries by Confirmed Cases:")
print(country_peak_cases.nlargest(5))