# 2021년 여름방학 특강 크롤링 데이터 저장하기
# 이미지(img), 데이터(DataFrame) 등 외부 데이터 저장하기
import requests
from bs4 import BeautifulSoup

# Template cell: replace the placeholder URL and CSS selector before running.
url = '수정필요'
response = requests.get(url)
response.encoding = 'utf-8'
html = response.text
soup = BeautifulSoup(html, 'html.parser')

# CSS selector syntax: '.' selects a class, '#' selects an id.
datagroup = soup.select('수정필요')
for no, data in enumerate(datagroup, 1):
    print(no, data.text)
import requests
from bs4 import BeautifulSoup

# Fetch the Ridibooks bestseller page and print each book title with its rank.
url = 'https://ridibooks.com/category/bestsellers/2200'
response = requests.get(url)
response.encoding = 'utf-8'
html = response.text
soup = BeautifulSoup(html, 'html.parser')

# '.title_text' selects elements by class ('.' = class, '#' = id).
datagroup = soup.select('.title_text')
for rank, node in enumerate(datagroup, start=1):
    print(rank, node.text.strip())
import shutil    # high-level file operations
import requests  # HTTP client library

# Download one cover image and save it locally as sample.jpg.
url = 'https://img.ridicdn.net/cover/194000109/xxlarge#1'
r = requests.get(url, stream=True)
with open('sample.jpg', 'wb') as out_file:
    # Make sure gzip/deflate transfer encoding is decoded before copying.
    r.raw.decode_content = True
    shutil.copyfileobj(r.raw, out_file)
import requests
from bs4 import BeautifulSoup

# Same cover download, but streamed to disk in 1 KiB chunks via iter_content.
r = requests.get('https://img.ridicdn.net/cover/194000109/xxlarge#1', stream=True)
if r.status_code == 200:
    with open('test.jpg', 'wb') as f:
        for chunk in r.iter_content(1024):
            f.write(chunk)
import requests
from bs4 import BeautifulSoup

# Crawl the bestseller page and print the full-size cover URL for each book.
url = 'https://ridibooks.com/category/bestsellers/2200'
response = requests.get(url)
response.encoding = 'utf-8'
html = response.text
soup = BeautifulSoup(html, 'html.parser')

# '.thumbnail' selects by class; each node's data-src holds the cover path.
datagroup = soup.select('.thumbnail')
for node in datagroup:
    # Drop the trailing 7-char size suffix and request the xxlarge variant.
    print('https:' + node['data-src'][:-7] + 'xxlarge#1')

# Bare expression (notebook cell output): raw URL of the first thumbnail.
'https:' + datagroup[0]['data-src']
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Crawl rank, title, and cover-image URL, then assemble them into a DataFrame.
url = 'https://ridibooks.com/category/bestsellers/2200'
response = requests.get(url)
response.encoding = 'utf-8'
html = response.text
soup = BeautifulSoup(html, 'html.parser')

책제목크롤링 = soup.select('.title_text')
책이미지크롤링 = soup.select('.thumbnail')

# Ranks are just 1..N over the crawled titles.
책순위 = list(range(1, len(책제목크롤링) + 1))
책제목 = [data.text.strip() for data in 책제목크롤링]
# Swap the 7-char size suffix for the xxlarge variant on each cover URL.
책이미지 = ['https:' + data['data-src'][:-7] + 'xxlarge#1' for data in 책이미지크롤링]

데이터 = pd.DataFrame({
    '책순위': 책순위,
    '책제목': 책제목,
    '책이미지': 책이미지,
})
데이터  # notebook cell output
def 이미지양식변환(path):
    """Return an HTML <img> tag (100px wide) for the given image path."""
    return f'<img src="{path}" width="100px" >'
데이터.to_html('index.html', escape=False, formatters=dict(책이미지=이미지양식변환))
# Build the same table by hand, one <tr> per book.
# Fixes vs. the original: rows are joined once instead of quadratic `s += ...`,
# the src attribute is quoted (the URL contains '#', which an unquoted
# attribute would mis-handle), and the file is written as explicit UTF-8
# (titles are Korean; the platform default encoding may not cover them).
rows = []
for i, j, k in zip(책순위, 책제목, 책이미지):
    rows.append(
        f'<tr>'
        f'<td>{i}</td>'
        f'<td>{j}</td>'
        f'<td><img src="{k}"></td>'
        f'</tr>'
    )
s = '<table>' + ''.join(rows) + '</table>'
with open("노동.html", "w", encoding="utf-8") as f:
    f.write(s)
import requests
from bs4 import BeautifulSoup
import shutil  # high-level file operations

# Print the full-size cover URL for every bestseller thumbnail.
url = 'https://ridibooks.com/category/bestsellers/2200'
response = requests.get(url)
response.encoding = 'utf-8'
html = response.text
soup = BeautifulSoup(html, 'html.parser')

# '.' selects a class, '#' selects an id.
datagroup = soup.select('.thumbnail')
for node in datagroup:
    print('https:' + node['data-src'][:-7] + 'xxlarge#1')

# Single-image download kept for reference (disabled):
# url = 'https://img.ridicdn.net/cover/194000109/xxlarge#1'
# r = requests.get(url, stream=True)
# with open('sample.jpg', 'wb') as f:
#     r.raw.decode_content = True
#     shutil.copyfileobj(r.raw, f)
import os
import requests
from bs4 import BeautifulSoup
import shutil  # high-level file operations

# Download every bestseller cover into the '성산일출봉' folder as 0.jpg, 1.jpg, ...
url = 'https://ridibooks.com/category/bestsellers/2200'
response = requests.get(url)
response.encoding = 'utf-8'
html = response.text
soup = BeautifulSoup(html, 'html.parser')

# '.' selects a class, '#' selects an id.
datagroup = soup.select('.thumbnail')

# Fix: the target folder must exist, otherwise every open() below raises
# FileNotFoundError. exist_ok=True makes re-running the cell safe.
os.makedirs('성산일출봉', exist_ok=True)

filename = 0
for data in datagroup:
    url = 'https:' + data['data-src'][:-7] + 'xxlarge#1'
    r = requests.get(url, stream=True)
    with open('성산일출봉/' + str(filename) + '.jpg', 'wb') as f:
        # Decode gzip/deflate transfer encoding before copying the raw stream.
        r.raw.decode_content = True
        shutil.copyfileobj(r.raw, f)
    filename += 1