Data Preprocessing 🧹
Real data is not clean. Data preprocessing is the essential process of cleaning data into a form that can be analyzed.
What is Data Preprocessing?
Data preprocessing is the process of transforming raw data into a form suitable for analysis or machine learning.
Main Tasks
- Missing value handling: dealing with missing data
- Duplicate removal: removing duplicated records
- Data type conversion: changing columns to appropriate types
- Outlier handling: dealing with abnormal values
- Data integration: combining multiple data sources
Info
Data scientists are said to spend about 80% of their working time on data preprocessing!
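To make these tasks concrete, here is a minimal sketch on a tiny, made-up DataFrame (the column names and values are hypothetical) that touches duplicate removal, type conversion, missing-value handling, and outlier handling in one pass:
import pandas as pd

# Hypothetical messy data: a duplicate row, a text-typed number, a missing value, an outlier
raw = pd.DataFrame({
    'id': [1, 2, 2, 3, 4],
    'age': ['25', '30', '30', None, '200'],
    'city': ['Seoul', 'Busan', 'Busan', 'Seoul', None]
})
clean = raw.drop_duplicates().copy()                         # duplicate removal
clean['age'] = pd.to_numeric(clean['age'], errors='coerce')  # type conversion
clean = clean.dropna(subset=['city'])                        # missing value handling
clean = clean[clean['age'].between(0, 120)]                  # outlier handling
print(clean)
Each of these steps is covered in detail in the sections that follow.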
Handling Missing Values
Checking Missing Values
import pandas as pd
import numpy as np
# Sample data (contains missing values)
data = {
'name': ['Alice', 'Bob', 'Charlie', None, 'Eve'],
'age': [25, None, 35, 28, 32],
'salary': [50000, 60000, None, 55000, None],
'department': ['IT', 'HR', 'IT', None, 'HR']
}
df = pd.DataFrame(data)
print("=== ์๋ณธ ๋ฐ์ดํฐ ===")
print(df)
# ๊ฒฐ์ธก์น ํ์ธ
print("\n=== ๊ฒฐ์ธก์น ๊ฐ์ ===")
print(df.isnull().sum())
# ๊ฒฐ์ธก์น ๋น์จ
print("\n=== ๊ฒฐ์ธก์น ๋น์จ ===")
print((df.isnull().sum() / len(df) * 100).round(2))
# ๊ฒฐ์ธก์น๊ฐ ์๋ ํ
print("\n=== ๊ฒฐ์ธก์น ํฌํจ ํ ===")
print(df[df.isnull().any(axis=1)])
Removing Missing Values
# Drop any row that contains a missing value
df_dropped_rows = df.dropna()
print("After dropping rows:", df_dropped_rows.shape)
# Drop any column that contains a missing value
df_dropped_cols = df.dropna(axis=1)
print("After dropping columns:", df_dropped_cols.shape)
# Drop rows with missing values only in specific columns
df_dropped_subset = df.dropna(subset=['name', 'age'])
print("After dropping rows missing name or age:", df_dropped_subset.shape)
# Drop rows only when every value is missing
df_dropped_all = df.dropna(how='all')
# Keep only rows with at least N non-missing values
df_dropped_thresh = df.dropna(thresh=3)  # at least 3 non-missing values
Filling Missing Values
# Fill with a specific value
df_filled = df.fillna(0)
df_filled = df.fillna('Unknown')
# Fill each column with a different value
df_filled = df.fillna({
    'age': df['age'].mean(),          # mean
    'salary': df['salary'].median(),  # median
    'department': 'Unknown'
})
# Fill with the previous/next value (useful for time series)
df_ffill = df.ffill()  # forward fill (previous value)
df_bfill = df.bfill()  # backward fill (next value)
# Interpolation
df['age'] = df['age'].interpolate()  # linear interpolation
Advanced Missing Value Handling
import pandas as pd
import numpy as np
# Sample data
data = {
'date': pd.date_range('2024-01-01', periods=10),
'temperature': [22, 23, None, 25, 26, None, 28, 29, None, 31],
'humidity': [60, 62, 65, None, 68, 70, None, 75, 77, 80]
}
df = pd.DataFrame(data)
# Fill with the group mean
df['category'] = ['A', 'A', 'B', 'B', 'A', 'B', 'A', 'B', 'A', 'B']
df['temperature'] = df.groupby('category')['temperature'].transform(
    lambda x: x.fillna(x.mean())
)
# Time-based interpolation (interpolate(method='time') requires a DatetimeIndex)
df = df.set_index('date')
df['temperature'] = df['temperature'].interpolate(method='time')
# KNN-based imputation (using scikit-learn)
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=2)
df[['temperature', 'humidity']] = imputer.fit_transform(
    df[['temperature', 'humidity']]
)
Handling Duplicate Data
Checking Duplicates
import pandas as pd
data = {
'name': ['Alice', 'Bob', 'Alice', 'Charlie', 'Bob'],
'age': [25, 30, 25, 35, 30],
'city': ['Seoul', 'Busan', 'Seoul', 'Incheon', 'Busan']
}
df = pd.DataFrame(data)
print("=== ์๋ณธ ๋ฐ์ดํฐ ===")
print(df)
# ์ค๋ณต ํ์ธ
print("\n=== ์ค๋ณต ์ฌ๋ถ ===")
print(df.duplicated())
# ์ค๋ณต ๊ฐ์
print(f"\n์ค๋ณต ํ ๊ฐ์: {df.duplicated().sum()}")
# ์ค๋ณต๋ ํ ๋ณด๊ธฐ
print("\n=== ์ค๋ณต๋ ํ ===")
print(df[df.duplicated(keep=False)])
Removing Duplicates
# Drop duplicates (keep the first occurrence)
df_unique = df.drop_duplicates()
print("After removing duplicates:", len(df_unique))
# Keep the last occurrence
df_unique_last = df.drop_duplicates(keep='last')
# Drop every duplicated row
df_no_duplicates = df.drop_duplicates(keep=False)
# Drop duplicates based on specific columns
df_unique_name = df.drop_duplicates(subset=['name'])
df_unique_multi = df.drop_duplicates(subset=['name', 'age'])
# Modify the original in place
df.drop_duplicates(inplace=True)
Data Type Conversion
Basic Type Conversion
import pandas as pd
data = {
'id': ['1', '2', '3', '4', '5'],
'price': ['1000', '1500', '2000', '2500', '3000'],
'date': ['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04', '2024-01-05'],
'is_active': ['True', 'False', 'True', 'True', 'False']
}
df = pd.DataFrame(data)
print("=== ์๋ณธ ํ์
===")
print(df.dtypes)
# ์ซ์๋ก ๋ณํ
df['id'] = df['id'].astype(int)
df['price'] = df['price'].astype(float)
# ๋ ์ง๋ก ๋ณํ
df['date'] = pd.to_datetime(df['date'])
# ๋ถ๋ฆฌ์ธ์ผ๋ก ๋ณํ
df['is_active'] = df['is_active'].map({'True': True, 'False': False})
print("\n=== ๋ณํ ํ ํ์
===")
print(df.dtypes)
Safe Conversion
# Convert, coercing errors
df['price'] = pd.to_numeric(df['price'], errors='coerce')  # NaN on failure
# Date conversion with error handling
df['date'] = pd.to_datetime(df['date'], errors='coerce')
# Category dtype (saves memory) -- assumes the DataFrame has a 'category' column
df['category'] = df['category'].astype('category')
Complex Conversion
# Extract numbers from strings
df = pd.DataFrame({'price_str': ['$1,000', '$1,500', '$2,000'],
                   'date_str': ['2024/01/01', '2024/01/02', '2024/01/03']})
df['price'] = df['price_str'].str.replace('[$,]', '', regex=True).astype(float)
# Convert dates with an explicit format
df['date'] = pd.to_datetime(df['date_str'], format='%Y/%m/%d')
# Or let pandas infer the format
df['date'] = pd.to_datetime(df['date_str'])
String Processing
Basic String Operations
import pandas as pd
data = {
'name': [' Alice ', 'BOB', 'charlie', 'David'],
'email': ['alice@example.com', 'BOB@EXAMPLE.COM', 'charlie@test.com', 'david@test.com']
}
df = pd.DataFrame(data)
# Strip whitespace
df['name'] = df['name'].str.strip()
# Case conversion
df['name_upper'] = df['name'].str.upper()
df['name_lower'] = df['name'].str.lower()
df['name_title'] = df['name'].str.title()
# Extract the email domain
df['domain'] = df['email'].str.split('@').str[1]
# Check whether a substring is present
df['has_test'] = df['email'].str.contains('test')
# String length
df['name_length'] = df['name'].str.len()
print(df)
Using Regular Expressions
import pandas as pd
data = {
    'phone': ['010-1234-5678', '02-987-6543', '031-555-1234', '010.9999.8888'],
    'text': ['Price: 10,000 won', 'Sale price 5,000 won', 'Regular price 20000 won', 'Special deal 15,000']
}
df = pd.DataFrame(data)
# Normalize the phone number format
df['phone_clean'] = df['phone'].str.replace('[.-]', '', regex=True)
# Extract only the numbers
df['price'] = df['text'].str.extract(r'(\d+,?\d+)')[0]
df['price'] = df['price'].str.replace(',', '').astype(int)
# Find discount-related patterns
df['has_discount'] = df['text'].str.contains('Sale|Special')
print(df)
Splitting and Joining Text
# Split strings
df = pd.DataFrame({'full_name': ['John Doe', 'Jane Smith', 'Bob Johnson']})
df[['first_name', 'last_name']] = df['full_name'].str.split(' ', expand=True)
# Join strings
df['full_name_reversed'] = df['last_name'] + ', ' + df['first_name']
# Combine several columns (assumes the DataFrame also has city/state/zip columns)
df['address'] = df['city'] + ', ' + df['state'] + ' ' + df['zip']
Data Merging
merge (similar to SQL JOIN)
import pandas as pd
# Employee info
employees = pd.DataFrame({
'emp_id': [1, 2, 3, 4],
'name': ['Alice', 'Bob', 'Charlie', 'David'],
'dept_id': [10, 20, 10, 30]
})
# Department info
departments = pd.DataFrame({
'dept_id': [10, 20, 30, 40],
'dept_name': ['IT', 'HR', 'Sales', 'Marketing']
})
# Inner join (intersection)
inner = pd.merge(employees, departments, on='dept_id', how='inner')
print("=== Inner Join ===")
print(inner)
# Left join (keep every row from the left table)
left = pd.merge(employees, departments, on='dept_id', how='left')
print("\n=== Left Join ===")
print(left)
# Right join (keep every row from the right table)
right = pd.merge(employees, departments, on='dept_id', how='right')
# Outer join (union)
outer = pd.merge(employees, departments, on='dept_id', how='outer')
# Join on differently named columns
df1 = pd.DataFrame({'id': [1, 2], 'value': [10, 20]})
df2 = pd.DataFrame({'key': [1, 2], 'data': [100, 200]})
merged = pd.merge(df1, df2, left_on='id', right_on='key')
concat (simple combination)
df1 = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
df2 = pd.DataFrame({'A': [5, 6], 'B': [7, 8]})
# Vertical concat (add rows)
vertical = pd.concat([df1, df2], ignore_index=True)
print("=== Vertical concat ===")
print(vertical)
# Horizontal concat (add columns)
df3 = pd.DataFrame({'C': [9, 10], 'D': [11, 12]})
horizontal = pd.concat([df1, df3], axis=1)
print("\n=== Horizontal concat ===")
print(horizontal)
join (index-based)
df1 = pd.DataFrame({'A': [1, 2, 3]}, index=['a', 'b', 'c'])
df2 = pd.DataFrame({'B': [4, 5, 6]}, index=['a', 'b', 'd'])
# Join on the index
joined = df1.join(df2, how='outer')
print("=== Join ===")
print(joined)
Grouping (GroupBy)
Basic Grouping
import pandas as pd
import numpy as np
# Sales data
data = {
    'date': pd.date_range('2024-01-01', periods=20),
    'product': np.random.choice(['A', 'B', 'C'], 20),
    'region': np.random.choice(['Seoul', 'Busan', 'Daegu'], 20),
    'sales': np.random.randint(100, 1000, 20)
}
df = pd.DataFrame(data)
# Total sales by product
product_sales = df.groupby('product')['sales'].sum()
print("=== Sales by product ===")
print(product_sales)
# Multiple aggregation functions
product_stats = df.groupby('product')['sales'].agg(['sum', 'mean', 'count'])
print("\n=== Stats by product ===")
print(product_stats)
# Group by multiple columns
region_product = df.groupby(['region', 'product'])['sales'].sum()
print("\n=== Sales by region and product ===")
print(region_product)
Advanced Grouping
# Apply different functions to different columns
agg_dict = {
    'sales': ['sum', 'mean', 'max'],
    'product': 'count'
}
result = df.groupby('region').agg(agg_dict)
print(result)
# Custom aggregation function
def range_func(x):
    return x.max() - x.min()
custom_agg = df.groupby('product')['sales'].agg([
    ('total', 'sum'),
    ('average', 'mean'),
    ('range', range_func)
])
print(custom_agg)
# Group-wise transform
df['sales_mean_by_product'] = df.groupby('product')['sales'].transform('mean')
df['sales_vs_avg'] = df['sales'] - df['sales_mean_by_product']
# Group-wise ranking
df['rank_in_product'] = df.groupby('product')['sales'].rank(ascending=False)
Pivot Tables
# Create a pivot table
pivot = pd.pivot_table(
    df,
    values='sales',
    index='region',
    columns='product',
    aggfunc='sum',
    fill_value=0
)
print("=== Pivot table ===")
print(pivot)
# Multiple aggregations
pivot_multi = pd.pivot_table(
df,
values='sales',
index='region',
columns='product',
aggfunc=['sum', 'mean'],
fill_value=0
)
Practical Examples
Example 1: Cleaning Customer Data
import pandas as pd
import numpy as np
# Messy customer data
data = {
'customer_id': [1, 2, 3, 4, 5, 5, 6, None, 8],
'name': [' Alice ', 'BOB', 'Charlie', None, 'Eve', 'Eve', 'Frank', 'Grace', 'Henry'],
'email': ['alice@test.com', 'bob@TEST.COM', 'charlie@test', 'dave@test.com',
'eve@test.com', 'eve@test.com', None, 'grace@test.com', 'henry@test.com'],
'age': ['25', '30', 'thirty', '28', '32', '32', '27', '29', None],
'purchase_amount': ['1000', '1,500', '2000', '500', '3000', '3000', '1200', '800', '1500'],
'join_date': ['2024-01-01', '2024/01/15', '2024.02.01', None, '2024-03-01',
'2024-03-01', '2024-03-15', '2024-04-01', '2024-04-15']
}
df = pd.DataFrame(data)
print("=== ์๋ณธ ๋ฐ์ดํฐ (๋ฌธ์ ๋ง์) ===")
print(df)
print(f"\n๋ฐ์ดํฐ ํฌ๊ธฐ: {df.shape}")
# Step 1: ์ค๋ณต ์ ๊ฑฐ
print("\n[1๋จ๊ณ] ์ค๋ณต ์ ๊ฑฐ")
df = df.drop_duplicates(subset=['customer_id', 'email'], keep='first')
print(f"์ค๋ณต ์ ๊ฑฐ ํ: {df.shape}")
# Step 2: ๊ฒฐ์ธก์น ์ฒ๋ฆฌ
print("\n[2๋จ๊ณ] ๊ฒฐ์ธก์น ์ฒ๋ฆฌ")
print(f"๊ฒฐ ์ธก์น ๊ฐ์:\n{df.isnull().sum()}")
# Drop rows with a missing customer_id (required field)
df = df.dropna(subset=['customer_id'])
# Fill missing names
df['name'] = df['name'].fillna('Unknown')
# Handle missing emails
df['email'] = df['email'].fillna('no-email@unknown.com')
# Step 3: convert data types
print("\n[Step 3] Convert data types")
# customer_id as integer
df['customer_id'] = df['customer_id'].astype(int)
# age: handle non-numeric values
df['age'] = pd.to_numeric(df['age'], errors='coerce')
df['age'] = df['age'].fillna(df['age'].median())
# purchase_amount: remove commas, then convert to numbers
df['purchase_amount'] = df['purchase_amount'].str.replace(',', '').astype(float)
# join_date: unify the various date formats (unparseable values become NaT)
df['join_date'] = pd.to_datetime(df['join_date'], errors='coerce')
df['join_date'] = df['join_date'].fillna(pd.Timestamp('2024-01-01'))
# Step 4: clean strings
print("\n[Step 4] Clean strings")
# name: strip whitespace, title case
df['name'] = df['name'].str.strip().str.title()
# email: lowercase
df['email'] = df['email'].str.lower()
# Step 5: validate emails
print("\n[Step 5] Validate emails")
email_pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
df['email_valid'] = df['email'].str.match(email_pattern)
print(f"์ ํจํ์ง ์์ ์ด๋ฉ์ผ: {(~df['email_valid']).sum()}๊ฐ")
# Step 6: ํ์ ๋ณ์ ์์ฑ
print("\n[6๋จ๊ณ] ํ์ ๋ณ์ ์์ฑ")
df['customer_value'] = pd.cut(
df['purchase_amount'],
bins=[0, 1000, 2000, float('inf')],
labels=['Low', 'Medium', 'High']
)
df['days_since_join'] = (pd.Timestamp.now() - df['join_date']).dt.days
print("\n=== ์ ์ ๋ ๋ฐ์ดํฐ ===")
print(df)
print(f"\n์ต์ข
๋ฐ์ดํฐ ํฌ๊ธฐ: {df.shape}")
print(f"\n๋ฐ์ดํฐ ํ์
:\n{df.dtypes}")
# ์ ์ ์์ฝ
print("\n=== ์ ์ ์์ฝ ===")
print(f"- ์ค๋ณต ์ ๊ฑฐ: {len(data['customer_id']) - len(df)}๊ฑด")
print(f"- ๊ฒฐ์ธก์น ์ฒ๋ฆฌ ์๋ฃ")
print(f"- ๋ฐ์ดํฐ ํ์
ํต์ผ")
print(f"- ๋ฌธ์์ด ์ ๊ทํ")
print(f"- ์ด๋ฉ์ผ ๊ฒ์ฆ")
Example 2: Integrating and Analyzing Sales Data
import pandas as pd
import numpy as np
# Online sales data
online_sales = pd.DataFrame({
    'order_id': [1001, 1002, 1003, 1004],
    'customer_id': [101, 102, 101, 103],
    'product': ['Laptop', 'Mouse', 'Keyboard', 'Monitor'],
    'amount': [1500000, 30000, 80000, 400000],
    'date': ['2024-01-15', '2024-01-16', '2024-01-17', '2024-01-18'],
    'channel': ['Online'] * 4
})
# Offline sales data
offline_sales = pd.DataFrame({
    'order_id': [2001, 2002, 2003],
    'customer_id': [101, 104, 102],
    'product': ['Mouse', 'Laptop', 'Keyboard'],
    'amount': [35000, 1600000, 75000],
    'date': ['2024-01-16', '2024-01-17', '2024-01-18'],
    'channel': ['Offline'] * 3
})
# Customer info
customers = pd.DataFrame({
    'customer_id': [101, 102, 103, 104, 105],
    'name': ['Kim', 'Lee', 'Park', 'Jung', 'Choi'],
    'grade': ['VIP', 'Regular', 'Regular', 'VIP', 'Regular'],
    'region': ['Seoul', 'Busan', 'Daegu', 'Seoul', 'Incheon']
})
print("=== 1. ๋ฐ์ดํฐ ํตํฉ ===")
# ์จ/์คํ๋ผ์ธ ๋ฐ์ดํฐ ํตํฉ
all_sales = pd.concat([online_sales, offline_sales], ignore_index=True)
print(f"์ ์ฒด ํ๋งค ๊ฑด์: {len(all_sales)}")
# ๋ ์ง ํ์
๋ณํ
all_sales['date'] = pd.to_datetime(all_sales['date'])
# ๊ณ ๊ฐ ์ ๋ณด์ ๋ณํฉ
sales_with_customer = pd.merge(
all_sales,
customers,
on='customer_id',
how='left'
)
print("\n=== 2. ํตํฉ ๋ฐ์ดํฐ ===")
print(sales_with_customer)
print("\n=== 3. ์ฑ๋๋ณ ๋ถ์ ===")
channel_stats = sales_with_customer.groupby('channel').agg({
'amount': ['sum', 'mean', 'count'],
'customer_id': 'nunique'
})
channel_stats.columns = ['์ด๋งค์ถ', 'ํ๊ท ๋งค์ถ', '๊ฑฐ๋๊ฑด์', '๊ณ ๊ฐ์']
print(channel_stats)
print("\n=== 4. ๊ณ ๊ฐ๋ณ ๋ถ์ ===")
customer_stats = sales_with_customer.groupby(['customer_id', 'name', 'grade']).agg({
'amount': 'sum',
'order_id': 'count'
})
customer_stats.columns = ['์ด๊ตฌ๋งค์ก', '๊ตฌ๋งคํ์']
customer_stats = customer_stats.sort_values('์ด๊ตฌ๋งค์ก', ascending=False)
print(customer_stats)
print("\n=== 5. ์ ํ๋ณ ๋ถ์ ===")
product_stats = sales_with_customer.groupby('product').agg({
'amount': ['sum', 'count'],
'customer_id': 'nunique'
})
product_stats.columns = ['์ด๋งค์ถ', 'ํ๋งค๊ฑด์', '๊ตฌ๋งค๊ณ ๊ฐ์']
product_stats = product_stats.sort_values('์ด๋งค์ถ', ascending=False)
print(product_stats)
print("\n=== 6. VIP vs ์ผ๋ฐ ๊ณ ๊ฐ ===")
grade_comparison = sales_with_customer.groupby('grade').agg({
'amount': ['sum', 'mean'],
'order_id': 'count'
})
grade_comparison.columns = ['์ด๋งค์ถ', 'ํ๊ท ๊ตฌ๋งค์ก', '๊ตฌ๋งคํ์']
print(grade_comparison)
print("\n=== 7. ์ง์ญ๋ณ ๋ถ์ ===")
region_sales = sales_with_customer.groupby('region')['amount'].sum().sort_values(ascending=False)
print(region_sales)
# Pivot table: sales by region × channel
print("\n=== 8. Sales by region and channel ===")
pivot = pd.pivot_table(
sales_with_customer,
values='amount',
index='region',
columns='channel',
aggfunc='sum',
fill_value=0
)
print(pivot)
Example 3: Preprocessing Time-Series Data
import pandas as pd
import numpy as np
# Hourly sensor data (missing values and outliers are injected below)
dates = pd.date_range('2024-01-01', periods=100, freq='h')
np.random.seed(42)
# Base sensor readings
data = {
'timestamp': dates,
'temperature': np.random.normal(20, 5, 100),
'humidity': np.random.normal(60, 10, 100)
}
df = pd.DataFrame(data)
# Add missing values and outliers
df.loc[10:15, 'temperature'] = np.nan
df.loc[30, 'temperature'] = 100  # outlier
df.loc[50:52, 'humidity'] = np.nan
df.loc[70, 'humidity'] = -50  # outlier
print("=== Original data ===")
print(f"Data shape: {df.shape}")
print(f"Missing values:\n{df.isnull().sum()}")
# Step 1: set the timestamp as the index
df.set_index('timestamp', inplace=True)
# Step 2: detect and handle outliers (IQR method)
print("\n[Outlier handling]")
for col in ['temperature', 'humidity']:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = (df[col] < lower_bound) | (df[col] > upper_bound)
    print(f"{col} outliers: {outliers.sum()}")
    # Replace outliers with NaN
    df.loc[outliers, col] = np.nan
# Step 3: interpolate missing values
print("\n[Missing value interpolation]")
df['temperature'] = df['temperature'].interpolate(method='time')
df['humidity'] = df['humidity'].interpolate(method='time')
print(f"Missing values after interpolation: {df.isnull().sum().sum()}")
# Step 4: smooth out noise with a rolling mean
df['temp_smooth'] = df['temperature'].rolling(window=3).mean()
df['humid_smooth'] = df['humidity'].rolling(window=3).mean()
# Step 5: statistics by hour of day
df['hour'] = df.index.hour
hourly_stats = df.groupby('hour').agg({
'temperature': ['mean', 'std'],
'humidity': ['mean', 'std']
})
print("\n=== ์๊ฐ๋๋ณ ํต๊ณ ===")
print(hourly_stats.head(10))
# Step 6: daily aggregation
daily = df.resample('D').agg({
'temperature': ['mean', 'min', 'max'],
'humidity': ['mean', 'min', 'max']
})
print("\n=== ์ผ๋ณ ์ง๊ณ ===")
print(daily)
print("\n=== ์ ์ฒ๋ฆฌ ์๋ฃ ===")
print(f"์ต์ข
๋ฐ์ดํฐ ํฌ๊ธฐ: {df.shape}")
print(f"๊ฒฐ์ธก์น: {df.isnull().sum().sum()}๊ฐ")
Data Preprocessing Pipeline
import pandas as pd
class DataCleaningPipeline:
    """A simple data preprocessing pipeline."""
    def __init__(self, df):
        self.df = df.copy()
        self.logs = []
    def log(self, message):
        """Record a log message."""
        self.logs.append(message)
        print(f"✓ {message}")
    def remove_duplicates(self, subset=None):
        """Remove duplicate rows."""
        before = len(self.df)
        self.df = self.df.drop_duplicates(subset=subset)
        removed = before - len(self.df)
        self.log(f"Removed {removed} duplicate rows")
return self
def handle_missing(self, strategy='drop', fill_value=None):
"""๊ฒฐ์ธก์น ์ฒ๋ฆฌ"""
missing_before = self.df.isnull().sum().sum()
if strategy == 'drop':
self.df = self.df.dropna()
elif strategy == 'fill':
self.df = self.df.fillna(fill_value)
elif strategy == 'mean':
numeric_cols = self.df.select_dtypes(include='number').columns
self.df[numeric_cols] = self.df[numeric_cols].fillna(
self.df[numeric_cols].mean()
)
missing_after = self.df.isnull().sum().sum()
self.log(f"๊ฒฐ์ธก์น {missing_before}๊ฐ โ {missing_after}๊ฐ")
return self
    def convert_types(self, type_dict):
        """Convert data types."""
for col, dtype in type_dict.items():
if col in self.df.columns:
self.df[col] = self.df[col].astype(dtype)
self.log(f"{col} โ {dtype} ๋ณํ")
return self
def clean_strings(self, columns):
"""๋ฌธ์์ด ์ ์ """
for col in columns:
if col in self.df.columns:
self.df[col] = (self.df[col]
.str.strip()
.str.lower()
)
self.log(f"{col} ๋ฌธ์์ด ์ ์ ")
return self
def remove_outliers(self, columns, method='iqr'):
"""์ด์์น ์ ๊ฑฐ"""
for col in columns:
if method == 'iqr':
Q1 = self.df[col].quantile(0.25)
Q3 = self.df[col].quantile(0.75)
IQR = Q3 - Q1
lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR
before = len(self.df)
self.df = self.df[(self.df[col] >= lower) & (self.df[col] <= upper)]
removed = before - len(self.df)
self.log(f"{col} ์ด์์น {removed}๊ฑด ์ ๊ฑฐ")
return self
def get_result(self):
"""๊ฒฐ๊ณผ ๋ฐํ"""
print("\n=== ์ ์ฒ๋ฆฌ ์๋ฃ ===")
for log in self.logs:
print(f" {log}")
return self.df
# Usage example
data = {
    'name': [' Alice ', 'BOB', 'Charlie', 'Alice', None],
    'age': [25, 30, 200, 25, 28],  # 200 is an outlier
    'email': ['alice@test.com', 'BOB@TEST.COM', 'charlie@test.com', 'alice@test.com', None]
}
df = pd.DataFrame(data)
# Run the pipeline
cleaned_df = (DataCleaningPipeline(df)
.remove_duplicates(subset=['name', 'email'])
.handle_missing(strategy='drop')
.clean_strings(['name', 'email'])
.remove_outliers(['age'])
.get_result()
)
print("\n์ ์ ๋ ๋ฐ์ดํฐ:")
print(cleaned_df)
Frequently Asked Questions
When should I drop missing values, and when should I fill them?
Drop them when:
- less than about 5% of the values are missing
- the values are missing at random
- the column is not important
Fill them when:
- a large share of the values is missing (20% or more)
- the data is a time series
- the feature is important
A rough heuristic based on these rules is sketched below.
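The sketch below assumes the thresholds above (5% as the drop cut-off) and uses a median or placeholder fill; the helper name and defaults are illustrative, not a fixed recipe. For time-series columns you would usually interpolate instead, as shown earlier.
import pandas as pd

def handle_missing_column(df, col, drop_threshold=0.05):
    """Illustrative helper: drop rows or fill values depending on how much of `col` is missing."""
    ratio = df[col].isnull().mean()
    if ratio == 0:
        return df
    if ratio < drop_threshold:
        # Few missing values: dropping the affected rows loses little data
        return df.dropna(subset=[col])
    if pd.api.types.is_numeric_dtype(df[col]):
        # Larger share missing in a numeric column: fill with the median
        return df.assign(**{col: df[col].fillna(df[col].median())})
    # Larger share missing in a non-numeric column: fill with a placeholder
    return df.assign(**{col: df[col].fillna('Unknown')})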
What is the difference between merge and join?
# merge: specify the join key explicitly
pd.merge(df1, df2, on='key')
# join: joins on the index (simpler)
df1.join(df2)
groupby is slow. What can I do?
# Slow: apply with an arbitrary Python function
df.groupby('category').apply(complex_function)
# Fast: built-in (vectorized) aggregations
df.groupby('category').agg({'column': 'sum'})
# For string keys, converting to a categorical dtype can also help
df['category'] = df['category'].astype('category')
df.groupby('category', observed=True)['column'].sum()
Next Steps
Once you are comfortable with data preprocessing, move on to:
- Advanced analysis: statistical analysis and hypothesis testing
- Feature engineering: creating variables for machine learning
- Data visualization: visualizing the results of preprocessing
- Automation: building reusable preprocessing pipelines