20230313~20230318 WIL

1. 이번주 리뷰

1)EC2 ubuntu 서버에 python selenium 돌리기 : https://incomeplus.tistory.com/266

AWS에 크롬드라이버 설치 selenium 돌리기

아마존 EC2에서 셀레니움 작업을 하기 위해서는 서버에 크롬드라이버 및 구글 크롬 브라우저를 설치 해줘야 한다. 파이썬으로 셀레니움을 아마존 EC2에서 돌려보자. 아래 명령어를 실행하는 기본

incomeplus.tistory.com

2)colab notebook 으로 python selenium 돌리기

각 단계별로 나누었으며, 위에서부터 아래로 순서대로 실행해야 한다.
알라딘 크롤링에 최적화되어 있으니, 필요하면 포맷만 바꿔서 쓰자.

# Register the saiarcot895/chromium-beta PPA as an additional package source.
!sudo add-apt-repository ppa:saiarcot895/chromium-beta

# Remove any existing Chromium: first the apt package, then the snap.
!sudo apt remove chromium-browser
!sudo snap remove chromium

# Install chromium-browser through apt.
!sudo apt install chromium-browser

# 이 부분은 처음 한 번만 실행하면 됨.
# 코드 수정 - "The reason is that the last Ubuntu update update supports chromium driver just via snap."
# 최근 우분투 업데이트에서 크롬 드라이버 설치를 snap을 이용해서만 하도록 바뀜
# 고로 snap 없이 설치하는 아래 우회 코드로 변경
# 출처 : https://colab.research.google.com/drive/1cbEvuZOhkouYLda3RqiwtbM-o9hxGLyC
# 출처2 : https://stackoverflow.com/questions/75155063/selenium-use-chrome-on-colab-got-unexpectedly-exited

%%shell
# Ubuntu no longer distributes chromium-browser outside of snap
#
# Proposed solution: https://askubuntu.com/questions/1204571/how-to-install-chromium-without-snap

# Add debian buster
# Writes an apt source list with three Debian buster entries (main, updates,
# security); each entry is bound to its own keyring file through signed-by.
cat > /etc/apt/sources.list.d/debian.list <<'EOF'
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster.gpg] http://deb.debian.org/debian buster main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster-updates.gpg] http://deb.debian.org/debian buster-updates main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-security-buster.gpg] http://deb.debian.org/debian-security buster/updates main
EOF

# Add keys
# Fetch the three signing keys from keyserver.ubuntu.com by their key IDs.
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A

# Export each key (addressed by the last 8 hex digits of the IDs above) and
# de-armor it into the keyring file that the matching signed-by entry expects.
apt-key export 77E11517 | gpg --dearmour -o /usr/share/keyrings/debian-buster.gpg
apt-key export 22F3D138 | gpg --dearmour -o /usr/share/keyrings/debian-buster-updates.gpg
apt-key export E562B32A | gpg --dearmour -o /usr/share/keyrings/debian-security-buster.gpg

# Prefer debian repo for chromium* packages only
# Note the double-blank lines between entries
# Priorities: chromium* from deb.debian.org (700) outranks the "eoan" release
# pin (500), while every other package from deb.debian.org stays at 300.
# NOTE(review): the first stanza only matches a release whose archive name is
# "eoan" - confirm that is still relevant for the image this runs on.
cat > /etc/apt/preferences.d/chromium.pref << 'EOF'
Package: *
Pin: release a=eoan
Pin-Priority: 500


Package: *
Pin: origin "deb.debian.org"
Pin-Priority: 300


Package: chromium*
Pin: origin "deb.debian.org"
Pin-Priority: 700
EOF

# Install chromium and chromium-driver
# apt-get update is required first so the newly added Debian source is indexed.
apt-get update
apt-get install chromium chromium-driver

# Install selenium
# No version is pinned, so pip installs whatever release is current.
pip install selenium

import time
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd

def hasEvent():
    """Report whether the current book entry carries an ``ss_ht1`` element.

    Reads the module-level ``book`` element that the scraping loop assigns on
    each iteration, so it must only be called while that loop is running.

    Returns:
        1 if an element with class ``ss_ht1`` exists inside the entry's
        ``ss_book_list`` container, otherwise 0.  The scraping loop adds this
        value to the ``<li>`` indexes it reads, i.e. it is a row offset
        (presumably for an extra event/promotion row - verify on the page).
    """
    try:
        # Only existence matters; the element's text is never used.
        book.find_element(By.CLASS_NAME, 'ss_book_list').find_element(By.CLASS_NAME, 'ss_ht1')
    except Exception:
        # Any lookup failure (normally "no such element") means no extra row.
        return 0
    return 1
def getFirst(li):
    """Read the title and the optional description from a row element.

    Args:
        li: element for the row; its first ``<a>`` holds the title.

    Returns:
        ``[title, description]``.  The description is the text of the
        ``ss_f_g2`` element with its first two characters dropped
        (presumably a leading separator - verify on the page), or ``''``
        when that element cannot be read.
    """
    heading = li.find_element(By.TAG_NAME, 'a').text
    try:
        detail_el = li.find_element(By.CLASS_NAME, 'ss_f_g2')
        detail = detail_el.text[2:]
    except Exception:
        # Rows without the element simply have no description.
        detail = ''
    return [heading, detail]
def getSecond(li):
    """Read author, publisher and publication date from a row element.

    Args:
        li: element for the row; its first ``<a>`` is taken as the author and
            its last ``<a>`` as the publisher.

    Returns:
        ``[author, publisher, date]`` where ``date`` is the stripped tail
        (at most 9 characters) of the row's text.  The scraping loop splits
        that value on a space into year and month parts.

    Raises:
        IndexError: if the row contains no ``<a>`` element.
    """
    links = li.find_elements(By.TAG_NAME, 'a')
    # Fetch the text once; on a real WebElement every access is a round trip.
    row_text = li.text
    author = links[0].text
    publish = links[-1].text
    # A plain negative slice is safe for rows shorter than 9 characters.
    # Computing the start as len - 9 went negative there and kept only the
    # last few characters instead of the whole text.
    date = row_text[-9:].strip()
    return [author, publish, date]
def getThird(li):
    """Return the text of the first ``<span>`` in the row.

    The scraping loop stores this value in the ``price`` position of the
    output row.
    """
    return li.find_element(By.TAG_NAME, 'span').text
def getfourth(li):
    """Read the star rating and the comment-count text from a row element.

    The rating is decoded from the ``src`` of the row's first ``<img>``: the
    two characters just before the final four characters of the URL are
    taken, and a leading ``'s'`` is dropped.  The comment count is the text
    of the row's first ``<a>``.

    Returns:
        ``[star, comment_number]`` as strings, or ``['0', '0']`` when any
        step of the lookup or decoding fails.
    """
    try:
        src = li.find_element(By.TAG_NAME, 'img').get_attribute('src')
        end = len(src)
        code = src[end - 6:end - 4]
        # One-digit ratings leave the preceding 's' in the two-char window.
        if code[0] == 's':
            code = code[1:]
        count_text = li.find_element(By.TAG_NAME, 'a').text
    except Exception:
        return ['0', '0']
    return [code, count_text]
def getImages():
    """Return the cover image URL of the current book entry.

    Reads the module-level ``book`` element assigned by the scraping loop.
    Inside ``cover_area`` the second ``<img>`` is used when there are exactly
    two images, otherwise the first.  If that lookup fails for any reason the
    first ``<img>`` inside ``cover_area_other`` is used instead; a failure of
    that fallback propagates to the caller.
    """
    try:
        covers = book.find_element(By.CLASS_NAME, 'cover_area').find_elements(By.TAG_NAME, 'img')
        chosen = covers[1] if len(covers) == 2 else covers[0]
        return chosen.get_attribute('src')
    except Exception:
        alt_area = book.find_element(By.CLASS_NAME, 'cover_area_other')
        return alt_area.find_element(By.TAG_NAME, 'img').get_attribute('src')

# Configure Chromium to run without a display.
options = webdriver.ChromeOptions()
options.add_argument('--headless')        # run without opening a browser window
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
# Do not pass the driver path positionally.  Recent Selenium 4 releases take
# `options` as the first positional parameter, so Chrome('chromedriver',
# options=options) fails with "got multiple values for argument 'options'".
# With no path given, Selenium locates the chromedriver binary itself.
driver = webdriver.Chrome(options=options)


# Top-level category to crawl and the page size used for the listing pages.
cid = 90833
per_page = 200
print("cid : ", cid)

# Open the category landing page and read its title plus the sub-category links.
url = f'https://www.aladin.co.kr/shop/wbrowse.aspx?CID={cid}&start=we_tab'
driver.get(url)
big = driver.find_element(By.CLASS_NAME, 'nbtit_w').text
cates = driver.find_element(By.CLASS_NAME, 'br2010_menu').find_elements(By.TAG_NAME, 'a')

# The title becomes part of a file name below, so drop every "/" from it.
sp = big.split("/")
big = "".join(sp)

name = f'/content/drive/MyDrive/booksbooks/외국bookdata_{big}2.csv'
print(name)
# Uncomment to create an empty CSV with the expected header (the scraping
# loop reads this file back on every page):
# newfrom = pd.DataFrame(columns=['category','babycategory','title','price','star','author','publish','image','year','month','inventory'])
# newfrom.to_csv(name,encoding='utf-8-sig')

# One [label, id] pair per sub-category link; the id is the first
# double-quoted token inside the link's href.
lili = [[cat.text, cat.get_attribute('href').split('"')[1]] for cat in cates]


# Resume switch: stays 0 (skip) until the resume category below is reached,
# then flips to 1 so that category and every later one gets scraped.
passpoint = 0
print(lili)
for cate in lili:
    try:
        # Links without a label are not real categories.
        if cate[0] == "":
            continue

        small = cate[0]
        print("카테고리류 : ", small)
        smallnum = cate[1]
        print("카테고리 넘버 : ", smallnum)

        # Hard-coded resume point from an earlier, interrupted run.
        resuming_here = small == "스포츠/레크레이션"
        if resuming_here:
            passpoint = 1
        if passpoint == 0:
            # Decide before touching the network: skipped categories need no
            # page load at all.
            print(small+"은 건너 뜀")
            continue

        # Probe the category with a small listing just to read the hit count.
        url3 = 'https://www.aladin.co.kr/shop/wbrowse.aspx?BrowseTarget=List&ViewRowsCount=25&ViewType=Detail&PublishMonth=0&SortOrder=2&page=1&Stockstatus=1&PublishDay=84&CID={0}&SearchOption='.format(smallnum)
        driver.get(url3)
        total = int(driver.find_element(By.CLASS_NAME, 'search_t_g').find_element(By.TAG_NAME, 'strong').text.replace(',', ''))
        # range() excludes its end, so the last page requested is
        # total // per_page + 1.
        pages = total // per_page + 2

        check = 1
        if resuming_here:
            # Continue the resume category from page 184 up to page 387.
            check = 184
            pages = 388

        for page in range(check, pages):
            url2 = 'https://www.aladin.co.kr/shop/wbrowse.aspx?BrowseTarget=List&ViewRowsCount={0}&ViewType=Detail&PublishMonth=0&SortOrder=2&page={1}&Stockstatus=1&PublishDay=84&CID={2}&SearchOption='.format(per_page, page, smallnum)
            driver.get(url2)
            books = driver.find_element(By.ID, 'Myform').find_elements(By.CLASS_NAME, 'ss_book_box')

            # The CSV on disk is the single source of truth: reload it for
            # every page and write it back after the page is processed.
            try:
                bookdata = pd.read_csv(name, index_col=0, header=0)
            except FileNotFoundError:
                # First run: start from an empty table instead of silently
                # skipping the whole category.
                bookdata = pd.DataFrame(columns=['category', 'babycategory', 'title', 'price', 'star', 'author', 'publish', 'image', 'year', 'month', 'inventory'])

            cnt = len(bookdata)
            print(big, "에서 ", small, "에서 ", page, "번째 페이지 진입")
            for book in books:
                try:
                    # hasEvent()/getImages() read the global `book` set by this loop.
                    index = hasEvent()
                    lis = book.find_element(By.CLASS_NAME, 'ss_book_list').find_elements(By.TAG_NAME, 'li')
                    firstLi = getFirst(lis[0 + index])
                    secondLi = getSecond(lis[1 + index])
                    thridLi = getThird(lis[2 + index])
                    try:
                        fourthLi = getfourth(lis[3 + index])
                    except IndexError:
                        # The entry has no fourth row (getfourth itself never
                        # raises an ordinary exception).
                        fourthLi = [0, 1]
                    print(cnt)
                    # The date text is split on a space; the last character of
                    # each part is dropped to leave the year and month values.
                    date_parts = secondLi[2].split(" ")
                    y = date_parts[0][0:-1]
                    m = date_parts[1][0:-1]

                    bookdata.loc[cnt] = [big, small, firstLi[0], thridLi, fourthLi[0], secondLi[0], secondLi[1], getImages(), y, m, 0]
                    cnt += 1
                except Exception:
                    # Best effort per book; Ctrl-C (KeyboardInterrupt) is no
                    # longer swallowed here.
                    print("여기서 하나 패스됨")

            try:
                bookdata.to_csv(name, encoding='utf-8-sig')
                print("저장 성공!")
            except Exception:
                print("저장실패!")

    except Exception:
        print(big, "에서", cate[0], "패스됨")

3)서버 이론

Redis
ElasticSearch
CI/CD
Docker
Logstash
Apache/ Tomcat/ Apache Kafka/Apache Lucene
Kubernetes

저작자표시

codebylhbs

20230313~20230318 WIL

1. 이번주 리뷰

티스토리툴바