데이터 분석

의사결정을 위한 데이터 인사이트 발굴
큰 수의 법칙 : 시행 횟수(데이터 수)가 많아질수록 관측된 비율(표본 평균)이 이론적 확률(기댓값)에 가까워집니다. 예) 동전을 많이 던질수록 앞면이 나온 비율은 0.5에 가까워집니다.
데이터분석의 도구 : Python, R

Python

변수

x = 3
y = 7
print(x + y)

10

type(x)

int

z = '10'

z + z

'1010'

# float
# str(string)

type(z)

str

int(z) + int(z) #형변환

20

# Colab - Google에서 제공하는, Jupyter Notebook과 유사한 노트북 서비스
# Python - 설치 후, PowerShell에서 `python 파일명.py`로 실행 가능

name = 'leehojun'
age = 10
print('제 이름은', name, '이고 나이는 ', age, '입니다.')

제 이름은 leehojun 이고 나이는  10 입니다.

print('제 이름은' + name + '이고 나이는 ' + str(age) + '입니다.')

제 이름은leehojun이고 나이는 10입니다.

print('제 이름은 {}이고 제 나이는 {}입니다.'.format(name, age))

제 이름은 leehojun이고 제 나이는 10입니다.

print('제 이름은 {1}이고 제 나이는 {1}입니다.'.format(name, age))

제 이름은 10이고 제 나이는 10입니다.

print(f'제 이름은 {name}이고 제 나이는 {age}입니다.')

제 이름은 leehojun이고 제 나이는 10입니다.

연산자

# +, -, /, //, *, **, %

x = 3
y = 7

print(x + y)
print(x - y)
print(y / x) # float(실수)
print(y // x) # int(정수)
print(x * y)
print(y ** x) # 승수
print(y % x) # 나머지

10
-4
2.3333333333333335
2
21
343
1

x = 3
x = x + 7
x += 7
x -= 3

10

x = 3
y = 7
print(x > y)
print(x >= y)
print(x < y)
print(x <= y)
print(x == y)
print(x != y)

False
False
True
True
False
True

x = True # 1 
y = False # 0
# and : 논리곱 (둘 다 True일 때만 True)
# or : 논리합 (둘 중 하나라도 True이면 True)

print(x and y)
print(x or y)
print(not y)

False
True
True

# Add up every integer below 100 that is a multiple of 3 or of 5,
# echoing each qualifying number as it is found.
s = 0
for i in range(100):
    # Skip numbers that leave a remainder for both 3 and 5.
    if i % 3 != 0 and i % 5 != 0:
        continue
    print(i)
    s += i
s

string

s = 'paullab ceo leehojun'
print(type(s))
print(dir(s))

<class 'str'>
['__add__', '__class__', '__contains__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getnewargs__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__mod__', '__mul__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__rmod__', '__rmul__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', 'capitalize', 'casefold', 'center', 'count', 'encode', 'endswith', 'expandtabs', 'find', 'format', 'format_map', 'index', 'isalnum', 'isalpha', 'isascii', 'isdecimal', 'isdigit', 'isidentifier', 'islower', 'isnumeric', 'isprintable', 'isspace', 'istitle', 'isupper', 'join', 'ljust', 'lower', 'lstrip', 'maketrans', 'partition', 'replace', 'rfind', 'rindex', 'rjust', 'rpartition', 'rsplit', 'rstrip', 'split', 'splitlines', 'startswith', 'strip', 'swapcase', 'title', 'translate', 'upper', 'zfill']

# Tour of common str methods, applied to `s` and to a few string literals.
s.count('l')  # number of times 'l' occurs in s
s.find('c') # find returns -1 when the substring is not present
s.index('c') # index raises ValueError when the substring is not present
# format fills the {} placeholders of the string it is called on, e.g.
# '제 이름은 {}입니다. 제 나이는 {}입니다'.format(name, age)
'!'.join(['010', '5044', '2903'])  # glue the items together with '!' between them
'-'.join(['010', '5044', '2903'])  # same items, '-' as the separator
s.replace('ceo', 'CEO')  # returns a new string; s itself is not modified
s.split(' ')  # list of the pieces between spaces
'연도,제조사,모델,설명,가격'.split(',')  # split a comma-separated header into fields
s.upper() # upper-case copy; lower() gives the lower-case copy
'1001'.zfill(10)  # left-pad with zeros to a width of 10

'0000001001'

# Slice syntax: s[start:stop:step]
s = 'paullab CEO leehojun'
s[0] # 0 is an index (indices start at 0); looking up one character like this is called indexing
s[3:7] # slicing: characters at indices 3, 4, 5 and 6 (stop is exclusive)
s[-3:]  # the last three characters
s[:]  # the whole string
s[::2]  # every second character, starting from index 0
s[::-1]  # a step of -1 walks backwards, giving the reversed string

'nujoheel OEC balluap'

list

순서가 있는 데이터 집합
수정이 가능한 데이터 집합
순회가 가능한 데이터 집합

a = [10, 20, 30, 40]

print(type(a))
print(dir(a))

<class 'list'>
['__add__', '__class__', '__contains__', '__delattr__', '__delitem__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__iadd__', '__imul__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__mul__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__reversed__', '__rmul__', '__setattr__', '__setitem__', '__sizeof__', '__str__', '__subclasshook__', 'append', 'clear', 'copy', 'count', 'extend', 'index', 'insert', 'pop', 'remove', 'reverse', 'sort']

a.append(100)

a

[10, 20, 30, 40, 100, 100]

a.count(100)

2

a.index(40)

3

a.pop()
a

[10, 20, 30, 40, 100]

a.reverse()

a

[100, 40, 30, 20, 10]

a.sort()
a

[10, 20, 30, 40, 100]

x = [1, 5, 4, 2, 6, 7, 8 ]
sorted(x)

[1, 2, 4, 5, 6, 7, 8]

x

[1, 5, 4, 2, 6, 7, 8]

reversed(x)

<list_reverseiterator at 0x7fa0dcc31050>

list(reversed(x))

[8, 7, 6, 2, 4, 5, 1]

x = [10, 20, 30]
x[2]
x[1] = 1000
x

[10, 1000, 30]

s = 'hello world'
#s[0] = 'k'
#s
'k' + s[1:]

'kello world'

list(range(10))
list(range(5, 10))
list(range(5, 30, 2))
list(range(10, 5, -1))

[10, 9, 8, 7, 6]

str(list(range(10000))).count('8')

#'88'.count('8')

4000

str(list(range(10)))

'[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]'

dict(dictionary)

key와 value로 이뤄져 있습니다.
값의 수정이 가능합니다.
index로 호출할 수 없습니다. key로 호출해야 합니다.

d = {'one':1, 'two':2}
d

{'one': 1, 'two': 2}

d['one']

1

d['one'] = 100

d

{'one': 100, 'two': 2}

dir(d)

['__class__',
 '__contains__',
 '__delattr__',
 '__delitem__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'clear',
 'copy',
 'fromkeys',
 'get',
 'items',
 'keys',
 'pop',
 'popitem',
 'setdefault',
 'update',
 'values']

# d.keys()
d.values()

dict_values([100, 2])

tuple

순서가 있고 변경 불가능한 자료형

t = (10, 20, 30, 40)
t[0]

10

중간정리

python 자료형

int(정수)
float(실수)
string(문자열, indexing, slicing, 메서드)
list
dictionary
tuple

연산

+, -, /, //, *, **, % 산술연산
==, !=, <, <=, >, >= 비교연산
a += 10 할당연산

if 10 > 5:
    print('hello world')

hello world

if 10 < 5:
    print('hello world')

# Four separate `if` statements: each condition is tested on its own, so
# every branch whose condition holds runs and the rewards add up.
중간고사점수 = 89
용돈 = 10000
if 중간고사점수 > 90:
    용돈 += 1000000
    print('엄마 : 대단해!')
if 중간고사점수 > 80:  # with a score of 89 this branch runs ...
    용돈 += 100000
    print('엄마 : 오!')
if 중간고사점수 > 70:  # ... and so does this one ...
    용돈 += 10000
    print('엄마 : 오?')
if 중간고사점수 > 60:  # ... and this one as well
    용돈 += 1000
    print('엄마 : 대단해?')
    
print(용돈)  # 10000 + 100000 + 10000 + 1000 = 121000

엄마 : 오!
엄마 : 오?
엄마 : 대단해?
121000

# An if/elif/else chain: conditions are tested top to bottom and only the
# first branch whose condition holds runs; the remaining branches are skipped.
중간고사점수 = 89
용돈 = 10000
if 중간고사점수 > 90:
    용돈 += 1000000
    print('엄마 : 대단해!')
elif 중간고사점수 > 80:  # first true condition for a score of 89
    용돈 += 100000
    print('엄마 : 오!')
elif 중간고사점수 > 70:
    용돈 += 10000
    print('엄마 : 오?')
elif 중간고사점수 > 60:
    용돈 += 1000
    print('엄마 : 대단해?')
else:
    # Reached only when the score is 60 or below: the allowance is reset to 0.
    print('!!')
    용돈 = 0
    
print(용돈)  # 10000 + 100000 = 110000 for a score of 89

if False:
    print('one')
elif False:
    print('two')
else:
    print('three')

if True:
    print('one')
if True:
    print('two')
if False:
    print('three')

for i in range(10):
    print(i)
    print('hello')
print('end')

0
hello
1
hello
2
hello
3
hello
4
hello
5
hello
6
hello
7
hello
8
hello
9
hello
end

for i in 'hello world':
    print(i)
    print('hello')
print('end')

h
hello
e
hello
l
hello
l
hello
o
hello
 
hello
w
hello
o
hello
r
hello
l
hello
d
hello
end

for i in [10, 20, 30]:
    print(i)
    print('hello')
print('end')

10
hello
20
hello
30
hello
end

for i in {'one':1, 'two':2, 'three':3}:
    print(i)
    print('hello')
print('end')

one
hello
two
hello
three
hello
end

s = 0
for i in range(101):
    s += i
s

5050

s = 0
for i in range(0, 101, 2):
    s += i
s

2550

x = 0
while x < 10:
    print(x)
    x += 1

s = [10, 20, 30]
for i in s:
    print(i)

10
20
30

x = 0
while True:
    print('hello')
    x += 1
    if x > 10:
        break

hello
hello
hello
hello
hello
hello
hello
hello
hello
hello
hello

s = [10, 20, 30]
while s:
    print(s.pop())

30
20
10

함수

파선아실 : 파라미터는 함수를 선언할 때, 아규먼트는 함수를 실행(호출)할 때 쓰는 값
함수를 사용하는 이유?
1. 구조 파악에 용이
2. 재사용
3. 유지보수

def add(x, y):  # x and y are called the parameters
    """Return the result of ``x + y``."""
    result = x + y
    return result

add(10, 20) # 10과 20을 아규먼트라고 부릅니다.

30

# 땅파기()
# 땅파기()
# 땅파기()
# 땅파기()
# 땅다지기()
# 벽돌쌓기()
# 지붕올리기()
# 땅파기()

def 원넓이(반지름):
    """Return the area of a circle with radius ``반지름``, using 3.14 for pi."""
    제곱 = 반지름 * 반지름
    return 제곱 * 3.14

원넓이(10)

314.0

hojun = print
hojun('hello world')

hello world

def 이호준10번출력하기():
    """Print '이호준' on ten consecutive lines.

    The function has no return statement, so a call evaluates to None.
    """
    남은횟수 = 10
    while 남은횟수 > 0:
        print('이호준')
        남은횟수 -= 1

print(이호준10번출력하기())

이호준
이호준
이호준
이호준
이호준
이호준
이호준
이호준
이호준
이호준
None

클래스

# Instance: one concrete car made from this class (the pastry, in the mold-and-pastry analogy)
class Car:
    """A minimal car type: two limits stored on the class and two actions."""

    # Members assigned in the class body are class variables; they can also
    # be read through an instance.
    maxSpeed = 300
    maxPeople = 6

    def stop(self):  # method
        """Announce that the car is stopping."""
        print('멈춥니다!!')

    def start(self):  # method
        """Announce that the car is setting off."""
        print('출발합니다!')

k5 = Car()
k5.maxPeople
k5.start()

k3 = Car()
k3.start()

출발합니다!
출발합니다!

l = [10, 20, 30, 1, 2, 3]
l.sort()

모듈

import test

test.age

10

test.name

'leehojun'

import test as t

t.name

'leehojun'

Numpy

import numpy as np

s = 10

s = [10, 20, 30, 40]
s = [10]

s = [[1, 2, 3],
     [4, 5, 6],
     [7, 8, 9]]

s[0]
s[0][0]

1

s = [[[1, 2], [3, 4]],
     [[1, 2], [3, 4]]]

s = [[1, 2, 3],
     [4, 5, 6],
     [7, 8, 9]]

# Double every element in place. Walking the rows themselves, instead of
# counting to a fixed 3 in both directions, keeps the loop correct for a
# nested list of any size.
for row in s:
    for j, value in enumerate(row):
        row[j] = value * 2

print(s)

[[2, 4, 6], [8, 10, 12], [14, 16, 18]]

s = [[1, 2, 3],
     [4, 5, 6],
     [7, 8, 9]]

a = np.array(s)

a * 2

array([[ 2,  4,  6],
       [ 8, 10, 12],
       [14, 16, 18]])

s * 2

[[1, 2, 3], [4, 5, 6], [7, 8, 9], [1, 2, 3], [4, 5, 6], [7, 8, 9]]

a.shape

(3, 3)

a.ndim

2

a.dtype.name

'int64'

a.size

9

type(a)

numpy.ndarray

# Build the integers 0..14 and arrange them as 3 rows by 5 columns.
# NOTE(review): this rebinds `test`, hiding the module brought in by the
# earlier `import test` in this file.
test = np.arange(15).reshape(3, 5)
test # side note: np.arange also accepts a non-integer (float) step

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

# Build the integers 0..159 and arrange them with shape (2, 4, 4, 5), i.e. a 4-dimensional array.
# NOTE(review): `test` is also the name used by the earlier `import test` in
# this file; assigning to it here hides that module.
test = np.arange(160).reshape(2, 4, 4, 5)
test # side note: np.arange also accepts a non-integer (float) step

array([[[[  0,   1,   2,   3,   4],
         [  5,   6,   7,   8,   9],
         [ 10,  11,  12,  13,  14],
         [ 15,  16,  17,  18,  19]],

        [[ 20,  21,  22,  23,  24],
         [ 25,  26,  27,  28,  29],
         [ 30,  31,  32,  33,  34],
         [ 35,  36,  37,  38,  39]],

        [[ 40,  41,  42,  43,  44],
         [ 45,  46,  47,  48,  49],
         [ 50,  51,  52,  53,  54],
         [ 55,  56,  57,  58,  59]],

        [[ 60,  61,  62,  63,  64],
         [ 65,  66,  67,  68,  69],
         [ 70,  71,  72,  73,  74],
         [ 75,  76,  77,  78,  79]]],


       [[[ 80,  81,  82,  83,  84],
         [ 85,  86,  87,  88,  89],
         [ 90,  91,  92,  93,  94],
         [ 95,  96,  97,  98,  99]],

        [[100, 101, 102, 103, 104],
         [105, 106, 107, 108, 109],
         [110, 111, 112, 113, 114],
         [115, 116, 117, 118, 119]],

        [[120, 121, 122, 123, 124],
         [125, 126, 127, 128, 129],
         [130, 131, 132, 133, 134],
         [135, 136, 137, 138, 139]],

        [[140, 141, 142, 143, 144],
         [145, 146, 147, 148, 149],
         [150, 151, 152, 153, 154],
         [155, 156, 157, 158, 159]]]])

np.zeros((3, 4))

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

np.ones((3, 4))

array([[1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.]])

np.linspace(0, 2, 9) # 0부터 2까지 9개

array([0.  , 0.25, 0.5 , 0.75, 1.  , 1.25, 1.5 , 1.75, 2.  ])

import random

random.randint(0, 10)

9

# Simulate 100 rolls of a six-sided die (randint includes both end points),
# then count how many of the rolls came up 3.
a = [random.randint(1, 6) for _ in range(100)]

a.count(3)

9

a = np.random.rand(5)
print(a)

[0.3992425  0.79793753 0.79281414 0.40914541 0.78105812]

a*10

array([3.99242503, 7.97937526, 7.92814136, 4.09145414, 7.81058117])

a*7

array([2.79469752, 5.58556268, 5.54969895, 2.8640179 , 5.46740682])

np.random.random((2, 3))

array([[0.75766853, 0.45178975, 0.40509635],
       [0.41185586, 0.96221473, 0.59624531]])

np.arange(15).reshape(3, 5)

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

np.arange(15).reshape(3, 5).min()

0

np.arange(15).reshape(3, 5).max()

14

np.arange(15).reshape(3, 5).sum()

105

np.arange(15).reshape(3, 5).mean()

7.0

np.arange(15).reshape(3, 5).var()

18.666666666666668

np.arange(15).reshape(3, 5).std()

4.320493798938574

a = np.arange(15).reshape(3, 5)
a

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

a.sum(axis=1)

array([10, 35, 60])

a.sum(axis=0)

array([15, 18, 21, 24, 27])

a = np.floor(10 * np.random.rand(2, 3))
a

array([[3., 8., 0.],
       [0., 5., 5.]])

np.arange(15).reshape(3, 5)

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

np.arange(15).reshape(3, 5).T

array([[ 0,  5, 10],
       [ 1,  6, 11],
       [ 2,  7, 12],
       [ 3,  8, 13],
       [ 4,  9, 14]])

(쉬어가기) 이미지 시각화

import numpy as np
from skimage import io
import matplotlib.pyplot as plt

jeju = io.imread('jeju.jpg')

type(jeju)

numpy.ndarray

jeju.shape

(1440, 1920, 3)

jeju

array([[[170, 223, 254],
        [170, 223, 254],
        [169, 222, 253],
        ...,
        [123, 187, 251],
        [123, 187, 251],
        [123, 187, 251]],

       [[170, 223, 254],
        [170, 223, 254],
        [170, 223, 254],
        ...,
        [123, 187, 251],
        [123, 187, 251],
        [123, 187, 251]],

       [[171, 224, 255],
        [171, 224, 255],
        [170, 223, 254],
        ...,
        [123, 187, 251],
        [123, 187, 251],
        [123, 187, 251]],

       ...,

       [[ 60,  80,  21],
        [ 34,  54,   0],
        [ 23,  43,   0],
        ...,
        [ 13,  38,   0],
        [ 21,  43,   5],
        [ 19,  41,   5]],

       [[ 33,  50,   0],
        [ 32,  49,   0],
        [ 47,  65,   5],
        ...,
        [ 12,  37,   0],
        [  8,  32,   0],
        [  7,  31,   0]],

       [[ 41,  55,   2],
        [ 51,  66,  11],
        [ 62,  80,  22],
        ...,
        [ 40,  65,  25],
        [ 29,  53,  17],
        [ 21,  45,   9]]], dtype=uint8)

plt.imshow(jeju)

<matplotlib.image.AxesImage at 0x7fa0dcb80590>

data = jeju[:]

x = [1, 2, 3, 4, 5]
x[::-1]

[5, 4, 3, 2, 1]

plt.imshow(data[::-1])

<matplotlib.image.AxesImage at 0x7fa0d05d9710>

plt.imshow(jeju[:, ::-1])

<matplotlib.image.AxesImage at 0x7fa0d058a0d0>

plt.imshow(jeju[800:1200, 700:1150])

<matplotlib.image.AxesImage at 0x7fa0d04f3210>

plt.imshow(jeju[::5, ::5])
plt.imshow(jeju[::10, ::10])
plt.imshow(jeju[::30, ::30])

<matplotlib.image.AxesImage at 0x7fa0d03bb6d0>