제주대 데이터분석 2Day / 3Day
2022년 6월 데이터분석을 위한 기초문법
- pandas
- 공식 홈페이지 튜토리얼
- What kind of data does pandas handle?
- Do something with a DataFrame or Series
- How do I read and write tabular data?
- How do I select a subset of a DataFrame?
- How do I filter specific rows from a DataFrame?
- How do I select specific rows and columns from a DataFrame?
- 크롤링 데이터로 웹페이지 만들기
- How to create plots in pandas?
- How to create new columns derived from existing columns?
- How to combine data from multiple tables?
import pandas as pd
# pandas ships many readers: read_csv, read_excel, read_html, ...
# read_html returns a list of DataFrames, one per table found on the page.
data = pd.read_html('https://ko.wikipedia.org/wiki/%EB%8C%80%ED%95%9C%EB%AF%BC%EA%B5%AD%EC%9D%98_%EC%9D%B8%EA%B5%AC')
data[4]
# NOTE(review): index 4 depends on the current page layout — confirm it is
# still the table that carries the '사망자수(명)' column.
인구수 = data[4]
사망자수 = 인구수[['사망자수(명)']]
사망자수
사망자수.sum()
# The summed Series is labelled by column name, so take the first value by
# position with .iloc; plain [0] relies on a deprecated positional fallback.
사망자수.sum().iloc[0]
# format(value, ',') inserts thousands separators.
format(10000000000000, ',')
format(사망자수.sum().iloc[0], ',')
import pandas as pd
# Real-world data mostly arrives as CSV files, so building a frame from a
# dict by hand like this is rare outside of small examples.
passenger_names = [
    "Braund, Mr. Owen Harris",
    "Allen, Mr. William Henry",
    "Bonnell, Miss. Elizabeth",
]
passenger_ages = [22, 35, 58]
passenger_sexes = ["male", "male", "female"]

# Each dict key becomes a column label; each list supplies that column's rows.
df = pd.DataFrame(
    {"Name": passenger_names, "Age": passenger_ages, "Sex": passenger_sexes}
)
df
- 시리즈는 데이터프레임에서 하나의 컬럼입니다.
# Single brackets pull one column out; type() shows what kind of object it is.
ages = df["Age"]
ages
type(ages)
type(df)
# A list of labels (double brackets) selects columns as a table instead.
age_frame = df[["Age"]]
age_frame
type(age_frame)
# Basic statistics of the Age column: max, min, mean, variance, std-dev.
ages.max()
ages.min()
ages.mean()
ages.var()
ages.std()
# Per-column dtypes, then describe() for the frame's summary statistics.
df.dtypes
df.describe()
# Load the Titanic passenger table from the local train.csv file.
titanic = pd.read_csv("train.csv")
titanic
titanic.head()
titanic.tail()
titanic.dtypes

# Round-trip through Excel: write a "passengers" sheet without the index
# column, then read that same sheet back.
excel_path = "titanic.xlsx"
sheet = "passengers"
titanic.to_excel(excel_path, sheet_name=sheet, index=False)
titanic_read_excel = pd.read_excel(excel_path, sheet_name=sheet)
titanic_read_excel
titanic.info()

# Shape of two single columns.
age_column = titanic["Age"]
sex_column = titanic["Sex"]
age_column.shape
sex_column.shape
# Single brackets do not work for several columns: the selection takes a
# list of labels (double brackets) and the result is a DataFrame.
age_and_sex = titanic[["Age", "Sex"]]
age_and_sex
type(age_and_sex)
age_and_sex.shape
# Boolean mask: True for passengers whose Age is greater than 35.
older_than_35 = titanic["Age"] > 35
above_35 = titanic[older_than_35]
above_35.head(10)

# Same filtering technique with an equality test on the Sex column.
is_male = titanic["Sex"] == 'male'
남자 = titanic[is_male]
남자.head(10)
남자.info()

older_than_35
# Summing a boolean Series counts its True values.
older_than_35.sum()
above_35.shape
남자.shape
titanic.shape
# Deriving a count by subtraction like this gives wrong results when values
# are missing, so always check for empty values first.
891 - 577
titanic.info()

# Rows whose Pclass is 2 or 3.
class_23 = titanic[titanic["Pclass"].isin([2, 3])]

# .loc takes a row condition and the wanted column label(s) together.
adult_names = titanic.loc[older_than_35, "Name"]
adult_names
adult_names = titanic.loc[older_than_35, ["Name", "Sex"]]
adult_names
# .iloc selects purely by position: rows 9-24 and columns 2-5.
titanic.iloc[9:25, 2:6]
import pandas as pd
# data = pd.read_html('https://ridibooks.com/category/bestsellers/2200')
# data
import requests
from bs4 import BeautifulSoup
# Download the Ridibooks bestseller page and print the text of every element
# matched by the selector, numbered from 1.
url = 'https://ridibooks.com/category/bestsellers/2200'  # edit for the target page
# A timeout keeps the request from hanging indefinitely, and
# raise_for_status() stops on an HTTP error instead of parsing an error page.
response = requests.get(url, timeout=10)
response.raise_for_status()
response.encoding = 'utf-8'
html = response.text
soup = BeautifulSoup(html, 'html.parser')
# presumably the elements holding book titles — verify against the page markup
bookservices = soup.select('.title_text')  # edit for the target page
for no, book in enumerate(bookservices, 1):
    print(no, book.text.strip())
import requests
from bs4 import BeautifulSoup
# Download a Naver search result page and print the text of every element
# matched by the selector, numbered from 1.
url = 'https://search.naver.com/search.naver?where=nexearch&sm=top_hty&fbm=1&ie=utf8&query=%EB%B0%95%EC%8A%A4%EC%98%A4%ED%94%BC%EC%8A%A4'  # edit for the target page
# A timeout keeps the request from hanging indefinitely, and
# raise_for_status() stops on an HTTP error instead of parsing an error page.
response = requests.get(url, timeout=10)
response.raise_for_status()
response.encoding = 'utf-8'
html = response.text
soup = BeautifulSoup(html, 'html.parser')
# Elements carrying the "name" class — verify against the page markup.
bookservices = soup.select('.name')  # edit for the target page
for no, book in enumerate(bookservices, 1):
    print(no, book.text.strip())
import requests
from bs4 import BeautifulSoup
# Download the Ridibooks bestseller page and print, for every matched
# thumbnail element, its rank, its alt text and its image URL.
url = 'https://ridibooks.com/category/bestsellers/2200'  # edit for the target page
# A timeout keeps the request from hanging indefinitely, and
# raise_for_status() stops on an HTTP error instead of parsing an error page.
response = requests.get(url, timeout=10)
response.raise_for_status()
response.encoding = 'utf-8'
html = response.text
soup = BeautifulSoup(html, 'html.parser')
bookservices = soup.select('.thumbnail')  # edit for the target page
for no, book in enumerate(bookservices, 1):
    # 'https:' is prepended to the data-src value to form the full URL
    # (presumably the attribute is scheme-relative — verify on the page).
    print(no, book['alt'], 'https:' + book['data-src'])
import requests
from bs4 import BeautifulSoup
# Download the Ridibooks bestseller page and collect rank, title and cover
# image URL of every matched thumbnail element into three parallel lists.
url = 'https://ridibooks.com/category/bestsellers/2200'  # edit for the target page
# A timeout keeps the request from hanging indefinitely, and
# raise_for_status() stops on an HTTP error instead of parsing an error page.
response = requests.get(url, timeout=10)
response.raise_for_status()
response.encoding = 'utf-8'
html = response.text
soup = BeautifulSoup(html, 'html.parser')
bookservices = soup.select('.thumbnail')  # edit for the target page
# Ranks are simply 1..N in page order.
책순위 = list(range(1, len(bookservices) + 1))
책이름 = [book['alt'] for book in bookservices]
# 'https:' is prepended to the data-src value to form the full URL
# (presumably the attribute is scheme-relative — verify on the page).
책이미지 = ['https:' + book['data-src'] for book in bookservices]
책이미지
# Assemble the three scraped lists into one table, one column per list.
ranking_columns = {
    '책순위': 책순위,
    '책이름': 책이름,
    '책이미지': 책이미지,
}
df = pd.DataFrame(ranking_columns)
df
# Write the table to an HTML file using the default to_html options.
output_path = 'index.html'
df.to_html(output_path)
def 이미지변환(path):
    """Return an ``<img>`` tag, 60 wide, whose ``src`` is *path*.

    Intended as a ``DataFrame.to_html`` cell formatter used together with
    ``escape=False``. Since that switches off pandas' own escaping, *path* is
    HTML-escaped here so that quotes, ampersands or angle brackets in a
    scraped URL cannot break out of the ``src`` attribute.

    Args:
        path: Image URL; non-string values are converted with ``str()``.

    Returns:
        The HTML markup for the image as a string.
    """
    # Imported locally under an alias: the bare name ``html`` is used in this
    # file for the downloaded page text.
    from html import escape as _escape_html

    return f'<img src="{_escape_html(str(path), quote=True)}" width="60" >'
# Re-export the page with the image column rendered as real <img> tags.
# escape=False turns off pandas' HTML escaping for every cell, so the scraped
# titles are escaped explicitly to keep markup inside a title from being
# injected into the page.
from html import escape as _escape_html  # aliased: ``html`` holds page text in this file

df.to_html(
    'index.html',
    escape=False,
    formatters={
        '책이름': lambda title: _escape_html(str(title)),
        '책이미지': 이미지변환,
    },
)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Reload the Titanic data for the plotting and derived-column examples.
df = pd.read_csv('train.csv')
# Default plot of the two family-count columns.
family_counts = df[['SibSp', 'Parch']]
family_counts.plot()
df.columns
# Age against Fare as a half-transparent scatter plot.
df.plot(kind="scatter", x="Age", y="Fare", alpha=0.5)
# Box plot of the Age column.
df[['Age']].plot(kind="box")
# New column computed from existing ones: 1 plus SibSp plus Parch.
df['Family'] = df['SibSp'] + df['Parch'] + 1
df
# Assigning a single string fills the whole new column with that value.
df['고향'] = '제주'
df
df["Age"].mean()  # mean
df[["Age", "Fare"]].median()  # median
df[["Age", "Fare"]].describe()  # general summary statistics
# Mean age per sex.
df[["Sex", "Age"]].groupby("Sex").mean()
# numeric_only=True is required here: the frame also holds text columns
# (e.g. Name), and pandas 2.0+ raises a TypeError when asked to average them.
df.groupby("Sex").mean(numeric_only=True)
df.groupby("Sex")["Age"].mean()
# Number of rows per distinct value.
df["Pclass"].value_counts()
df["Sex"].value_counts()
df.sort_values(by="Age").head()  # sorts without modifying the original; ascending
df.sort_values(by="Age", ascending=False).head()  # descending
df.sort_values(by=['Pclass', 'Age'], ascending=False).head()
# Keep only the rows whose Sex is "female".
is_female = df["Sex"] == "female"
여성 = df[is_female]
여성.head()
# sort_index is the method that sorts rows by their index.
by_index_asc = 여성.sort_index()
by_index_asc.groupby(["Age"]).head(5)
by_index_desc = 여성.sort_index(ascending=False)
by_index_desc.groupby(["Age"]).head(5)
# Slice syntax on the frame: reversed row order, then a full slice.
여성[::-1]
여성[:]
# Restructure the data: PassengerId as index, one column per Pclass, Fare as values.
여성.pivot(index="PassengerId", columns="Pclass", values="Fare")
# Three small score tables: data and data3 share their column labels,
# data2 uses different ones.
data = pd.DataFrame({'수학': [90, 80], '영어': [70, 60]})
data2 = pd.DataFrame({'언어': [20, 70], '과학': [30, 60]})
data3 = pd.DataFrame({'수학': [100, 90], '영어': [85, 65]})
data
# Adding frames with different column labels versus identical column labels.
data + data2
data + data3
# Concatenate along axis 0.
pd.concat([data, data2], axis=0)
data
# Copy columns over from data2 one at a time.
data['언어'] = data2['언어']
# Alternatives: data['과학'] = data2['과학']
#           or: data[['언어', '과학']] = data2[['언어', '과학']]
data
data['과학'] = data2['과학']
data
# Concatenate along axis 1.
pd.concat([data, data2], axis=1)
# Rebuild the three score tables from scratch, then concatenate along axis 1.
raw_tables = (
    {'수학': [90, 80], '영어': [70, 60]},
    {'언어': [20, 70], '과학': [30, 60]},
    {'수학': [100, 90], '영어': [85, 65]},
)
data, data2, data3 = (pd.DataFrame(table) for table in raw_tables)
pd.concat([data, data2], axis=1)
# Two tables keyed by the '이름' column; data2 lacks one of the names in data.
data = pd.DataFrame({'이름': ['영희', '철수', '호준'], '수학': [70, 60, 90]})
data2 = pd.DataFrame({'이름': ['영희', '호준'], '과학': [50, 70], '언어': [90, 60]})
data
data2
# Left join on '이름': every row of data is kept, and names without a match
# in data2 get missing values in the columns coming from data2.
merge = data.merge(data2, how="left", on="이름")
merge
# to_datetime assembles dates from a frame with year / month / day columns.
df = pd.DataFrame({'year': [2021, 2021],
                   'month': [7, 7],
                   'day': [9, 10]})
df
data = pd.to_datetime(df)
data
# The .dt accessor exposes the individual datetime components.
data.dt.year
data.dt.month
data.dt.day
data.dt.weekday
data.dt.day_name()  # use day_name(); weekday_name was removed in newer pandas versions
# Request UTC explicitly: pd.to_datetime('now') produced UTC only on
# pandas < 2.0 and returns local time since then.
now_utc = pd.Timestamp.now(tz='UTC')
now_utc
# Reload the Titanic data for the string-method examples.
df = pd.read_csv('train.csv')
df.head()
# The .str accessor applies string methods to each value of the column.
names = df["Name"]
names.str.lower()
names.str.split(",")
# NOTE(review): this is a substring match, so "Mr" also matches names that
# contain "Mrs" — confirm that is intended.
has_mr = names.str.contains("Mr")
has_mr
has_mr.value_counts()
df[has_mr]
# Swap the two Sex labels for the numbers 1 and 0.
sex_codes = {"male": 1, "female": 0}
df["Sex"].replace(sex_codes)