Getting Started with Pandas 🐼

Pandas is one of the most powerful and popular libraries for data analysis in Python. It makes tabular data easy to work with.

What is Pandas?

The name "pandas" is derived from "panel data", an econometrics term for multidimensional structured data sets. It is a library for efficiently handling structured data.

Key Features

DataFrame: 표 형태의 데이터 구조
다양한 파일 형식 지원: CSV, Excel, JSON, SQL 등
강력한 데이터 조작: 필터링, 그룹화, 병합
결측치 처리: 누락된 데이터 쉽게 다루기

Installation

pip install pandas

import pandas as pd

# Confirm which pandas version is installed
installed_version = pd.__version__
print(installed_version)  # e.g. 2.0.3 (depends on your environment)

Core Data Structures

Series (1-dimensional)

A Series is a one-dimensional labeled array; each column of a DataFrame is a Series.

import pandas as pd

# Create from a list (a default integer index is assigned)
s = pd.Series([10, 20, 30, 40, 50])
print(s)
# 0    10
# 1    20
# 2    30
# 3    40
# 4    50
# dtype: int64

# Specify the index labels explicitly
s = pd.Series([10, 20, 30], index=['a', 'b', 'c'])
print(s)
# a    10
# b    20
# c    30
# dtype: int64

# Create from a dictionary (keys become the index labels)
data = {'Seoul': 9800000, 'Busan': 3400000, 'Incheon': 2900000}
s = pd.Series(data)
print(s)
# Seoul      9800000
# Busan      3400000
# Incheon    2900000
# dtype: int64

# Access
print(s['Seoul'])      # 9800000 (by label)
# Use .iloc for positional access: s[0] on a string-labelled Series relies on
# a deprecated positional fallback and is treated as a label lookup in newer pandas.
print(s.iloc[0])       # 9800000 (by position)
print(s[['Seoul', 'Busan']])  # select several values

DataFrame (2-dimensional)

A DataFrame is a two-dimensional, tabular data structure with labeled rows and columns.

import pandas as pd

# Build a DataFrame from a dict that maps column name -> list of values
data = dict(
    name=['Alice', 'Bob', 'Charlie', 'David'],
    age=[25, 30, 35, 28],
    city=['Seoul', 'Busan', 'Incheon', 'Seoul'],
)
df = pd.DataFrame(data)
print(df)
#       name  age     city
# 0    Alice   25    Seoul
# 1      Bob   30    Busan
# 2  Charlie   35  Incheon
# 3    David   28    Seoul

# Build a DataFrame from a list of rows, naming the columns explicitly
column_names = ['name', 'age', 'city']
data = [
    ['Alice', 25, 'Seoul'],
    ['Bob', 30, 'Busan'],
    ['Charlie', 35, 'Incheon'],
]
df = pd.DataFrame(data, columns=column_names)
print(df)

# Loading from a CSV file is covered in detail later:
# df = pd.read_csv('data.csv')

DataFrame Basic Information

import pandas as pd

data = {
    'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'age': [25, 30, 35, 28, 32],
    'salary': [50000, 60000, 75000, 55000, 65000],
    'department': ['IT', 'HR', 'IT', 'Sales', 'HR']
}
df = pd.DataFrame(data)

# Basic information
print(df.shape)        # (5, 4) - number of rows, columns
print(df.columns)      # Index(['name', 'age', 'salary', 'department'])
print(df.index)        # RangeIndex(start=0, stop=5, step=1)
print(df.dtypes)       # data type of each column

# Look at the first / last few rows
print(df.head(3))      # first 3 rows
print(df.tail(2))      # last 2 rows

# Summary information
# info() prints its report itself and returns None, so wrapping it in
# print() would emit a stray "None" line after the report.
df.info()              # overall summary
print(df.describe())   # statistics for the numeric columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   name        5 non-null      object
 1   age         5 non-null      int64
 2   salary      5 non-null      int64
 3   department  5 non-null      object
dtypes: int64(2), object(2)

Data Selection

Column Selection

# A single column comes back as a Series
print(df['name'])
# Attribute-style access is an alternative spelling
print(df.name)

# A list of column names returns a DataFrame
selected_columns = ['name', 'age']
print(df[selected_columns])

# Add a new column computed from an existing one
bonus = df['salary'] * 0.1
df['bonus'] = bonus
print(df)

Row Selection

# Select by integer position (iloc)
print(df.iloc[0])          # first row
print(df.iloc[0:3])        # rows 0-2
print(df.iloc[[0, 2, 4]])  # rows 0, 2 and 4

# Select by label (loc)
# Keep the name-indexed frame under its own name. Rebinding `df` would move
# 'name' out of the columns, so any later df['name'] lookup would raise KeyError.
df_by_name = df.set_index('name')
print(df_by_name.loc['Alice'])     # the Alice row
print(df_by_name.loc[['Alice', 'Bob']])  # several rows

# Specific rows and columns
print(df_by_name.iloc[0:2, 0:2])   # first 2 rows, first 2 columns
print(df_by_name.loc['Alice', 'age'])  # Alice's age

Conditional Selection (Filtering)

# Single condition: build a boolean mask, then index with it
older_than_30 = df['age'] > 30
print(df[older_than_30])
in_it = df['department'] == 'IT'
print(df[in_it])

# Multiple conditions: combine masks with & (and) / | (or)
age_and_salary = (df['age'] > 25) & (df['salary'] > 55000)
print(df[age_and_salary])
in_hr = df['department'] == 'HR'
print(df[in_it | in_hr])

# Membership test with isin()
print(df[df['department'].isin(['IT', 'Sales'])])

# String conditions via the .str accessor
name_text = df['name'].str
print(df[name_text.startswith('A')])
print(df[name_text.contains('a')])

Reading/Writing Data

CSV Files

import pandas as pd

# Read a CSV file
df = pd.read_csv('employees.csv')

# Read with options
df = pd.read_csv(
    'employees.csv',
    encoding='utf-8',                 # file encoding
    index_col='id',                   # column to use as the index
    # usecols must include the index_col column; leaving 'id' out makes
    # read_csv fail because the index column is never loaded.
    usecols=['id', 'name', 'age'],    # load only these columns
    na_values=['NA', 'N/A']           # extra strings to treat as missing values
)

# Write a CSV file
# NOTE: index=False omits the index, so the 'id' index set above is not written.
# 'utf-8-sig' writes a UTF-8 byte order mark at the start of the file.
df.to_csv('output.csv', index=False, encoding='utf-8-sig')

Excel Files

pip install openpyxl

# Read one sheet from an Excel file
df = pd.read_excel('employees.xlsx', sheet_name='Sheet1')

# Read every sheet: sheet_name=None returns a dict of {sheet name: DataFrame}
dfs = pd.read_excel('employees.xlsx', sheet_name=None)
# Use a separate loop variable so `df` (Sheet1) is not overwritten by the last sheet.
for sheet_name, sheet_df in dfs.items():
    print(f"Sheet: {sheet_name}")
    print(sheet_df.head())

# Write to an Excel file
df.to_excel('output.xlsx', sheet_name='Data', index=False)

# Write several sheets into one workbook.
# Iterates the frames read above instead of referring to undefined df1/df2.
with pd.ExcelWriter('output.xlsx') as writer:
    for sheet_name, sheet_df in dfs.items():
        sheet_df.to_excel(writer, sheet_name=sheet_name, index=False)

JSON Files

# Load a JSON file into a DataFrame
df = pd.read_json('data.json')

# Write the DataFrame out as a list of records, indented by 2 spaces
df.to_json('output.json', indent=2, orient='records')

Basic Statistics

import pandas as pd
import numpy as np

data = {
    'product': ['A', 'B', 'C', 'D', 'E'],
    'price': [1000, 1500, 2000, 1200, 1800],
    'quantity': [100, 150, 80, 120, 90],
}
df = pd.DataFrame(data)

# Basic statistics for a single column
price = df['price']
print(price.mean())      # mean
print(price.median())    # median
print(price.std())       # standard deviation
print(price.min())       # minimum
print(price.max())       # maximum
print(price.sum())       # sum

# Several statistics at once
numeric = df[['price', 'quantity']]
print(numeric.describe())

# Correlation between the numeric columns
print(numeric.corr())

# Count of each distinct value
product_counts = df['product'].value_counts()
print(product_counts)

Sorting

import pandas as pd

# Sample data so this snippet runs on its own: the `df` left over from the
# Basic Statistics example has no 'age', 'salary' or 'department' columns.
df = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'age': [25, 30, 35, 28, 32],
    'salary': [50000, 60000, 75000, 55000, 65000],
    'department': ['IT', 'HR', 'IT', 'Sales', 'HR'],
})

# Sort by values
by_age = df.sort_values('age')
by_age_desc = df.sort_values('age', ascending=False)  # descending

# Sort by several columns (department ascending, salary descending)
by_dept_salary = df.sort_values(['department', 'salary'], ascending=[True, False])

# Sort by index
sorted_df = df.sort_index()

Practical Examples

예제 1: 판매 데이터 분석

import pandas as pd
import numpy as np

# Generate sample sales data (the seed is fixed before drawing random values)
np.random.seed(42)
dates = pd.date_range('2024-01-01', periods=100)
data = {
    'date': dates,
    'product': np.random.choice(['A', 'B', 'C', 'D'], 100),
    'quantity': np.random.randint(1, 20, 100),
    'price': np.random.choice([1000, 1500, 2000, 2500], 100),
}
df = pd.DataFrame(data)

# Revenue per row = quantity * unit price
df['revenue'] = df['quantity'] * df['price']

print("=== 판매 데이터 요약 ===")
print(df.head())

# Total revenue
total_revenue = df['revenue'].sum()
print(f"\n총 매출: {total_revenue:,}원")

# One grouping by product, reused by the per-product summaries below
by_product = df.groupby('product')

# Units sold per product, largest first
product_sales = by_product['quantity'].sum().sort_values(ascending=False)
print("\n제품별 판매량:")
print(product_sales)

# Average unit price per product
avg_price = by_product['price'].mean()
print("\n제품별 평균 단가:")
print(avg_price)

# Day with the highest revenue
daily_revenue = df.groupby('date')['revenue'].sum()
best_day = daily_revenue.idxmax()
best_revenue = daily_revenue.max()
print(f"\n최고 매출 날짜: {best_day.date()} ({best_revenue:,}원)")

# Products ranked by revenue
product_revenue = by_product['revenue'].sum().sort_values(ascending=False)
print("\n제품별 매출 순위:")
for position, (product, revenue) in enumerate(product_revenue.items(), start=1):
    print(f"{position}. 제품 {product}: {revenue:,}원")

예제 2: 학생 성적 관리

import pandas as pd

# Student score data
data = {
    'student_id': [1001, 1002, 1003, 1004, 1005],
    'name': ['김철수', '이영희', '박민수', '정지은', '최호진'],
    'math': [85, 92, 78, 95, 88],
    'english': [90, 88, 85, 92, 86],
    'science': [78, 95, 80, 88, 92]
}
df = pd.DataFrame(data)
subjects = ['math', 'english', 'science']

print("=== 학생 성적표 ===")
print(df)

# Total and average across the three subjects
df['total'] = df[subjects].sum(axis=1)
df['average'] = df[subjects].mean(axis=1)

# Rank by total score (ties share the best rank).
# rank() returns floats, so cast to int to display 1, 2, ... instead of 1.0, 2.0, ...
df['rank'] = df['total'].rank(ascending=False, method='min').astype(int)

# Sort by rank; a stable sort keeps tied students in their original order
df = df.sort_values('rank', kind='stable')

print("\n=== 성적 결과 ===")
print(df[['name', 'total', 'average', 'rank']])

# Per-subject statistics
print("\n=== 과목별 통계 ===")
stats = df[subjects].agg(['mean', 'max', 'min', 'std'])
print(stats.round(2))

# Top students (average of 90 or above)
excellent = df[df['average'] >= 90]
print(f"\n=== 우수 학생 ({len(excellent)}명) ===")
print(excellent[['name', 'average']])

# Best student in each subject
print("\n=== 과목별 1등 ===")
for subject in subjects:
    # idxmax() returns an index label, so look the row up with .loc
    top_student = df.loc[df[subject].idxmax()]
    print(f"{subject}: {top_student['name']} ({top_student[subject]}점)")

예제 3: 월별 지출 분석

import pandas as pd

# Expense records
expense_dates = pd.to_datetime([
    '2024-01-05', '2024-01-12', '2024-01-20',
    '2024-02-03', '2024-02-15', '2024-02-28',
    '2024-03-08', '2024-03-18', '2024-03-25',
])
data = {
    'date': expense_dates,
    'category': ['식비', '교통', '쇼핑', '식비', '문화', '식비', '교통', '쇼핑', '식비'],
    'amount': [45000, 50000, 120000, 38000, 25000, 52000, 48000, 95000, 41000],
}
df = pd.DataFrame(data)

# Derive month columns from the date
date_parts = df['date'].dt
df['month'] = date_parts.month
df['month_name'] = date_parts.strftime('%Y-%m')

print("=== 지출 내역 ===")
print(df)

# Total spending
total = df['amount'].sum()
print(f"\n총 지출: {total:,}원")

# Spending per category: total, number of entries and mean, largest total first
category_expense = (
    df.groupby('category')['amount']
    .agg(['sum', 'count', 'mean'])
    .rename(columns={'sum': '총액', 'count': '횟수', 'mean': '평균'})
    .sort_values('총액', ascending=False)
)
print("\n=== 카테고리별 지출 ===")
print(category_expense)

# Spending per month
monthly_expense = df.groupby('month_name')['amount'].sum()
print("\n=== 월별 지출 ===")
for month, amount in monthly_expense.items():
    print(f"{month}: {amount:,}원")

# Largest single expense
largest_row_label = df['amount'].idxmax()
max_expense = df.loc[largest_row_label]
print(f"\n=== 최대 지출 ===")
print(f"{max_expense['date'].date()} - {max_expense['category']}: {max_expense['amount']:,}원")

# Compare each month against the budget (150,000 won per month)
budget = 150000
monthly_total = df.groupby('month_name')['amount'].sum()
print("\n=== 예산 분석 (월 예산: 150,000원) ===")
for month, amount in monthly_total.items():
    diff = budget - amount
    if diff >= 0:
        status = "예산 내"
    else:
        status = "예산 초과"
    print(f"{month}: {amount:,}원 ({status}, {abs(diff):,}원)")

예제 4: 직원 데이터 분석

import pandas as pd
import numpy as np

# Employee data
# Fix the seed so the randomly generated sample (and every number printed
# below) is the same on each run, matching the sales example.
np.random.seed(42)
data = {
    'employee_id': range(1001, 1021),
    'name': [f'직원{i}' for i in range(1, 21)],
    'department': np.random.choice(['IT', 'HR', 'Sales', 'Marketing'], 20),
    'position': np.random.choice(['사원', '대리', '과장', '차장', '부장'], 20),
    'salary': np.random.randint(3000, 8000, 20) * 1000,
    'years': np.random.randint(1, 15, 20)
}
df = pd.DataFrame(data)

print("=== 직원 현황 ===")
print(df.head(10))

# Headcount per department
dept_count = df['department'].value_counts()
print("\n=== 부서별 인원 ===")
print(dept_count)

# Salary statistics per department
dept_salary = df.groupby('department')['salary'].agg(['mean', 'min', 'max'])
dept_salary.columns = ['평균', '최소', '최대']
print("\n=== 부서별 연봉 ===")
print(dept_salary.round())

# Statistics per position
position_stats = df.groupby('position').agg({
    'salary': 'mean',
    'years': 'mean',
    'employee_id': 'count'
})
position_stats.columns = ['평균연봉', '평균경력', '인원']
print("\n=== 직급별 통계 ===")
print(position_stats.round())

# Analysis by experience band: (0, 3], (3, 7], (7, 15] years
df['experience_level'] = pd.cut(
    df['years'],
    bins=[0, 3, 7, 15],
    labels=['신입', '중급', '고급']
)
# observed=False is passed explicitly so every band is listed even when it
# has no employees; the default for categorical groupers differs between
# pandas versions.
exp_salary = df.groupby('experience_level', observed=False)['salary'].mean()
print("\n=== 경력별 평균 연봉 ===")
print(exp_salary.round())

# Five highest-paid employees
top5 = df.nlargest(5, 'salary')[['name', 'department', 'position', 'salary']]
print("\n=== 연봉 Top 5 ===")
print(top5)

Useful Tips

1. 체이닝 (Method Chaining)

# Chain several operations into a single pipeline, one step per line
result = (
    df.query('age > 25')                       # keep rows where age > 25
    .sort_values('salary', ascending=False)    # highest salary first
    .head(10)                                  # first 10 rows
    .reset_index(drop=True)                    # discard the old index
)

2. apply 함수

# Apply a function to every value of a column
def _salary_category(salary):
    """Return 'High' for salaries above 60000, otherwise 'Normal'."""
    if salary > 60000:
        return 'High'
    return 'Normal'


df['salary_category'] = df['salary'].apply(_salary_category)


# Apply a function to each row (axis=1) when several columns are needed
def _bonus(row):
    """Return 20% of salary for the Sales department, 10% for everyone else."""
    if row['department'] == 'Sales':
        return row['salary'] * 0.2
    return row['salary'] * 0.1


df['bonus'] = df.apply(_bonus, axis=1)

3. 날짜 처리

# Assign one parsed timestamp to the whole column, then read fields via .dt
df['date'] = pd.to_datetime('2024-01-01')
date_fields = df['date'].dt
df['year'] = date_fields.year
df['month'] = date_fields.month
df['day_name'] = date_fields.day_name()

Frequently Asked Questions

DataFrame과 Excel의 차이는?

Pandas DataFrame:

프로그래밍으로 제어
대용량 데이터 처리 가능
자동화 쉬움
복잡한 연산 가능

Excel:

GUI 기반
작은 데이터에 적합
시각적 편집 용이
수식 입력 직관적

loc와 iloc의 차이는?

df = pd.DataFrame({'A': [1, 2, 3]}, index=list('abc'))

# loc: label-based lookup
row_by_label = df.loc['a']  # the row whose index label is 'a'
print(row_by_label)

# iloc: integer-position-based lookup
row_by_position = df.iloc[0]  # the first row
print(row_by_position)

복사본 vs 뷰?

# Filtered selection (ambiguous): boolean indexing returns new data, not a
# live view, so assigning into the result does NOT change `df`. pandas may
# emit SettingWithCopyWarning here because the intent is unclear.
filtered = df[df['age'] > 30]
filtered['age'] = 99  # SettingWithCopyWarning; df itself is unchanged

# Explicit copy (safe): .copy() states that an independent object is wanted
filtered_copy = df[df['age'] > 30].copy()
filtered_copy['age'] = 99  # OK

# To change the original instead, assign through .loc on `df` itself,
# using the same mask as the row selector and 'age' as the column.

대용량 데이터 처리는?

# Read the file in chunks of 10000 rows and handle each chunk in turn
chunk_reader = pd.read_csv('large_file.csv', chunksize=10000)
for chunk in chunk_reader:
    process(chunk)

# Load only the columns that are needed
wanted_columns = ['col1', 'col2']
df = pd.read_csv('large_file.csv', usecols=wanted_columns)

# Declare column dtypes up front
column_types = {'col1': 'int32'}
df = pd.read_csv('large_file.csv', dtype=column_types)

Next Steps

Pandas 기초를 익혔다면 다음을 학습해보세요:

데이터 전처리: 결측치, 중복, 변환
데이터 병합: merge, join, concat
그룹화: groupby 고급 활용
시계열: 날짜/시간 데이터 처리
시각화: Matplotlib과 함께 사용

What is Pandas?​

Key Features​

Installation​

Core Data Structures​

Series (1-dimensional)​

DataFrame (2-dimensional)​

DataFrame Basic Information​

Data Selection​

Column Selection​

Row Selection​

Conditional Selection (Filtering)​

Reading/Writing Data​

CSV Files​

Excel Files​

JSON Files​

Basic Statistics​

Sorting​

Practical Examples​

예제 1: 판매 데이터 분석​

예제 2: 학생 성적 관리​

예제 3: 월별 지출 분석​

예제 4: 직원 데이터 분석​

Useful Tips​

1. 체이닝 (Method Chaining)​

2. apply 함수​

3. 날짜 처리​

Frequently Asked Questions​

DataFrame과 Excel의 차이는?​

loc와 iloc의 차이는?​

복사본 vs 뷰?​

대용량 데이터 처리는?​

Next Steps​

References​