1. 이번주 리뷰
1)EC2 ubuntu 서버에 python selenium 돌리기 : https://incomeplus.tistory.com/266
2)colab notebook 으로 python selenium 돌리기
- 각 단계별로 나누었으며, 위에서부터 아래로 순서대로 수행해야 한다.
- 알라딘 크롤링에 최적화되어 있으며, 포맷만 바꿔 쓰자.
# Chromium install via apt: add the chromium-beta PPA, remove any existing
# apt and snap Chromium, then install chromium-browser from apt.
# NOTE(review): the notes that follow say the install was changed to a
# snap-free workaround — presumably this cell is superseded; confirm before use.
!sudo add-apt-repository ppa:saiarcot895/chromium-beta
!sudo apt remove chromium-browser
!sudo snap remove chromium
!sudo apt install chromium-browser
# 이 부분은 처음 한 번만 실행하면 됨.
# 코드 수정 - "The reason is that the last Ubuntu update supports chromium driver just via snap."
# 최근 우분투 업데이트에서 크롬 드라이버 설치를 snap을 이용해서만 하도록 바뀜
# 고로 snap 없이 설치하는 아래 우회 코드로 변경
# 출처 : https://colab.research.google.com/drive/1cbEvuZOhkouYLda3RqiwtbM-o9hxGLyC
# 출처2 : https://stackoverflow.com/questions/75155063/selenium-use-chrome-on-colab-got-unexpectedly-exited
%%shell
# Install Chromium and chromium-driver from the Debian buster repositories
# (no snap required), then install Selenium.
#
# Ubuntu no longer distributes chromium-browser outside of snap
#
# Proposed solution: https://askubuntu.com/questions/1204571/how-to-install-chromium-without-snap

# Add debian buster
cat > /etc/apt/sources.list.d/debian.list <<'EOF'
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster.gpg] http://deb.debian.org/debian buster main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster-updates.gpg] http://deb.debian.org/debian buster-updates main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-security-buster.gpg] http://deb.debian.org/debian-security buster/updates main
EOF

# Add keys
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A

# Export each key into the keyring file referenced by signed-by= above
apt-key export 77E11517 | gpg --dearmour -o /usr/share/keyrings/debian-buster.gpg
apt-key export 22F3D138 | gpg --dearmour -o /usr/share/keyrings/debian-buster-updates.gpg
apt-key export E562B32A | gpg --dearmour -o /usr/share/keyrings/debian-security-buster.gpg

# Prefer debian repo for chromium* packages only
# Note the double-blank lines between entries: apt reads each
# Package/Pin/Pin-Priority group as a separate record only when the groups
# are separated by blank lines.
cat > /etc/apt/preferences.d/chromium.pref << 'EOF'
Package: *
Pin: release a=eoan
Pin-Priority: 500


Package: *
Pin: origin "deb.debian.org"
Pin-Priority: 300


Package: chromium*
Pin: origin "deb.debian.org"
Pin-Priority: 700
EOF

# Install chromium and chromium-driver
# -y answers the confirmation prompt so the cell can run unattended.
apt-get update
apt-get install -y chromium chromium-driver

# Install selenium
pip install selenium
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
def hasEvent():
    """Return 1 when the current book has an ``ss_ht1`` element, else 0.

    Reads the module-level ``book`` element (set by the crawl loop) and looks
    for an ``ss_ht1`` element inside its ``ss_book_list``.  The caller adds
    the result to its row indices, so 1 shifts every row lookup down by one.
    Any exception during the lookup is treated as "not present".
    """
    try:
        book_list = book.find_element(By.CLASS_NAME, 'ss_book_list')
        # Reading .text is part of the probe: if it fails, the answer is 0.
        book_list.find_element(By.CLASS_NAME, 'ss_ht1').text
    except Exception:
        return 0
    return 1
def getFirst(li):
    """Extract the title and description from a book's first info row.

    Args:
        li: Row element; the text of its first ``<a>`` is used as the title.

    Returns:
        ``[title, description]``.  The description is the text of the
        ``ss_f_g2`` element with its first two characters removed, or ``''``
        when that element cannot be read.
    """
    title = li.find_element(By.TAG_NAME, 'a').text
    try:
        description = li.find_element(By.CLASS_NAME, 'ss_f_g2').text[2:]
    except Exception:
        # No readable description element: fall back to an empty description.
        return [title, '']
    return [title, description]
def getSecond(li):
    """Extract author, publisher and publication date from the second info row.

    Args:
        li: Row element.  The first ``<a>`` is taken as the author and the
            last ``<a>`` as the publisher.

    Returns:
        ``[author, publish, date]`` where ``date`` is the last nine characters
        of the row text with surrounding whitespace stripped.

    Raises:
        IndexError: If the row contains no ``<a>`` elements.
    """
    links = li.find_elements(By.TAG_NAME, 'a')
    author = links[0].text
    publish = links[-1].text
    # Read the row text once.  A plain negative slice keeps the whole text when
    # it is shorter than nine characters; computing len(text) - 9 would yield a
    # negative start index there and silently cut the text short.
    row_text = li.text
    date = row_text[-9:].strip()
    return [author, publish, date]
def getThird(li):
    """Return the text of the first ``<span>`` in the row (stored as the price)."""
    price_span = li.find_element(By.TAG_NAME, 'span')
    return price_span.text
def getfourth(li):
    """Extract the star rating and the comment count from the fourth info row.

    The rating is cut from the ``src`` of the row's first ``<img>``: the two
    characters just before the final four characters of the URL (presumably
    the file extension — confirm against real image URLs).  A leading ``'s'``
    is dropped from that pair.  The comment count is the text of the first
    ``<a>``.

    Returns:
        ``[star, comment_number]`` as strings; ``['0', '0']`` when any lookup
        or slicing step fails.
    """
    try:
        src = li.find_element(By.TAG_NAME, 'img').get_attribute('src')
        suffix_start = len(src) - 4
        star = src[suffix_start - 2:suffix_start]
        if star[0] == 's':
            star = star[1:]
        comment_number = li.find_element(By.TAG_NAME, 'a').text
    except Exception:
        # Either value missing means both are reported as '0'.
        return ['0', '0']
    return [star, comment_number]
def getImages():
    """Return the cover image URL of the current book.

    Reads the module-level ``book`` element (set by the crawl loop).  Inside
    ``cover_area`` the second ``<img>`` is used when exactly two are present,
    otherwise the first.  If that lookup fails for any reason, the first
    ``<img>`` inside ``cover_area_other`` is used instead; a failure there
    propagates to the caller.
    """
    try:
        covers = book.find_element(By.CLASS_NAME, 'cover_area').find_elements(By.TAG_NAME, 'img')
        chosen = covers[1] if len(covers) == 2 else covers[0]
        return chosen.get_attribute('src')
    except Exception:
        fallback_area = book.find_element(By.CLASS_NAME, 'cover_area_other')
        return fallback_area.find_element(By.TAG_NAME, 'img').get_attribute('src')
# Create the Chrome driver used by the rest of the notebook.
options = webdriver.ChromeOptions()
options.add_argument('--headless')  # headless mode
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
# Do not pass the driver path positionally: Selenium 4.10+ removed that
# parameter, so Chrome('chromedriver', options=options) raises TypeError
# there.  Without it Selenium locates chromedriver on its own, which is what
# older versions did with their default 'chromedriver' path as well.
driver = webdriver.Chrome(options=options)
# Crawl settings: root category id and number of rows requested per list page.
cid = 90833
per_page = 200
print("cid : ", cid)

# Open the root category page; read its title and the sub-category links.
url = f'https://www.aladin.co.kr/shop/wbrowse.aspx?CID={cid}&start=we_tab'
driver.get(url)
big = driver.find_element(By.CLASS_NAME, 'nbtit_w').text
menu = driver.find_element(By.CLASS_NAME, 'br2010_menu')
cates = menu.find_elements(By.TAG_NAME, 'a')

# Remove every '/' from the title before it is embedded in the file name.
big = "".join(big.split("/"))

# Output CSV path on the mounted Drive.
name = f'/content/drive/MyDrive/booksbooks/외국bookdata_{big}2.csv'
print(name)
# Uncomment to write a fresh, empty CSV with the expected columns:
# newfrom = pd.DataFrame(columns=['category','babycategory','title','price','star','author','publish','image','year','month','inventory'])
# newfrom.to_csv(name,encoding='utf-8-sig')

# One [label, category number] pair per link; the number is the first
# double-quoted token inside the link's href.
lili = [[cat.text, cat.get_attribute('href').split('"')[1]] for cat in cates]

# Stays 0 until the crawl loop reaches the category it should resume from.
passpoint = 0
print(lili)
# Crawl every sub-category in `lili` and append one row per book to the CSV at
# `name`.  The CSV is re-read before and saved after each list page, so an
# interrupted run keeps everything up to the last completed page.

# Column layout of the output CSV; used to start an empty table when the file
# does not exist yet.
BOOK_COLUMNS = ['category', 'babycategory', 'title', 'price', 'star', 'author',
                'publish', 'image', 'year', 'month', 'inventory']

# Resume point of a previous run: categories listed before RESUME_CATEGORY are
# skipped, and RESUME_CATEGORY itself is crawled over the fixed page range
# below instead of the range derived from its book count.
RESUME_CATEGORY = "스포츠/레크레이션"
RESUME_FIRST_PAGE = 184
RESUME_END_PAGE = 388  # exclusive, passed straight to range()

# List-page URL; placeholders are rows per page, page number, category number.
LIST_URL = 'https://www.aladin.co.kr/shop/wbrowse.aspx?BrowseTarget=List&ViewRowsCount={0}&ViewType=Detail&PublishMonth=0&SortOrder=2&page={1}&Stockstatus=1&PublishDay=84&CID={2}&SearchOption='

for cate in lili:
    try:
        # Links without a label are not categories.
        if cate[0] == "":
            continue
        small = cate[0]
        print("카테고리류 : ", small)
        smallnum = cate[1]
        print("카테고리 넘버 : ", smallnum)

        # Decide whether to skip before loading any page for this category.
        if small == RESUME_CATEGORY:
            passpoint = 1
        if passpoint == 0:
            print(small+"은 건너 뜀")
            continue

        # Read the total book count from a small (25-row) first page.
        driver.get(LIST_URL.format(25, 1, smallnum))
        total = int(driver.find_element(By.CLASS_NAME, 'search_t_g').find_element(By.TAG_NAME, 'strong').text.replace(',', ''))
        pages = int(total / per_page) + 2
        check = 1
        if small == RESUME_CATEGORY:
            check = RESUME_FIRST_PAGE
            pages = RESUME_END_PAGE

        for page in range(check, pages):
            driver.get(LIST_URL.format(per_page, page, smallnum))
            books = driver.find_element(By.ID, 'Myform').find_elements(By.CLASS_NAME, 'ss_book_box')
            try:
                bookdata = pd.read_csv(name, index_col=0, header=0)
            except FileNotFoundError:
                # First run: no CSV yet, so start from an empty table.
                bookdata = pd.DataFrame(columns=BOOK_COLUMNS)
            cnt = len(bookdata)
            print(big,"에서 ",small,"에서 ",page,"번째 페이지 진입")

            # The loop variable must be named `book`: hasEvent() and
            # getImages() read it as a module-level name.
            for book in books:
                try:
                    index = hasEvent()
                    lis = book.find_element(By.CLASS_NAME, 'ss_book_list').find_elements(By.TAG_NAME, 'li')
                    firstLi = getFirst(lis[0 + index])
                    secondLi = getSecond(lis[1 + index])
                    thirdLi = getThird(lis[2 + index])
                    try:
                        fourthLi = getfourth(lis[3 + index])
                    except Exception:
                        # No fourth row for this book.
                        fourthLi = [0, 1]
                    print(cnt)
                    # The date text is split on spaces; the last character of
                    # each of the first two parts is dropped.
                    date_parts = secondLi[2].split(" ")
                    y = date_parts[0][0:-1]
                    m = date_parts[1][0:-1]
                    bookdata.loc[cnt] = [big, small, firstLi[0], thirdLi, fourthLi[0], secondLi[0], secondLi[1], getImages(), y, m, 0]
                    cnt += 1
                except Exception:
                    # `except Exception` (not a bare except) so that Ctrl-C /
                    # kernel interrupt still stops the crawl.
                    print("여기서 하나 패스됨")

            # Save after every page so progress is not lost.
            try:
                bookdata.to_csv(name, encoding='utf-8-sig')
                print("저장 성공!")
            except Exception:
                print("저장실패!")
    except Exception:
        print(big,"에서", cate[0],"패스됨")
3)서버 이론
- Redis
- ElasticSearch
- CI/CD
- Docker
- Logstash
- Apache / Tomcat / Apache Kafka / Apache Lucene
- Kubernetes