Skip to content

Commit

Permalink
first commit
Browse files Browse the repository at this point in the history
  • Loading branch information
ymnseol committed Aug 5, 2021
0 parents commit d33612c
Show file tree
Hide file tree
Showing 4 changed files with 241 additions and 0 deletions.
124 changes: 124 additions & 0 deletions index.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
import pandas as pd
from selenium import webdriver
import time

def keyword_crawling(keyword): # 테스트를 위한 간소화 상태
rel_search = []
user_keyword = keyword

driver = webdriver.Chrome('/Users/seolyumin/chromedriver')
url = "https://m.some.co.kr/analysis/keyword"
driver.get(url)

search_box = driver.find_element_by_xpath('/html/body/div[2]/div[1]/section/div/div/section[1]/div/input')
search_box.send_keys(user_keyword)

click_box = driver.find_element_by_xpath('/html/body/div[2]/div[1]/section/div/div/section[3]/a')
click_box.click()

time.sleep(3)

table = driver.find_element_by_xpath('/html/body/div[2]/section[2]/div/div/section/div[2]/div[2]/div[3]/div/div/div[5]/div/div/div/div/table/tbody')

for tr in table.find_elements_by_tag_name('tr'):
td = tr.find_elements_by_tag_name('td')
rel_search.append(td[1].text)

driver.quit()

return rel_search

def movie_crawling():
url_login = 'https://nid.naver.com/nidlogin.login'
id = '' # 네이버 아이디 입력
pw = '' # 네이버 비번 입력

driver = webdriver.Chrome('/Users/seolyumin/chromedriver') # 크롬드라이버 위치 입력
driver.get(url_login)
driver.implicitly_wait(2)

# execute_script 함수 사용하여 자바스크립트로 id,pw 넘겨주기
driver.execute_script("document.getElementsByName('id')[0].value=\'" + id + "\'")
driver.execute_script("document.getElementsByName('pw')[0].value=\'" + pw + "\'")

# 로그인 버튼 클릭하기
driver.find_element_by_xpath('//*[@id="frmNIDLogin"]/fieldset/input').click()
time.sleep(1)

# 브라우저 등록 / 등록 안 함
driver.find_element_by_xpath('/html/body/div[1]/div[2]/div/form/fieldset/span[2]/a').click()
time.sleep(1)

base_url = 'https://serieson.naver.com/movie/categoryList.nhn?categoryCode=100005&orderType=sale&sortingType=&mobileYn=&drmFreeYn=&freeYn=&discountYn=&tagCode=1&page={}'

comment_list = []
for page in range(1, 2):
url = base_url.format(page)
driver.get(url)
time.sleep(3)
for ul in range(1, 6):
for li in range(1, 6):
driver.find_element_by_xpath(
"/html/body/div[2]/div[2]/div[2]/div/ul[" + str(ul) + "]/li[" + str(li) + "]/a").click()
title = driver.find_element_by_xpath("/html/body/div[2]/div[2]/div[2]/div[1]/h2").text
rate = driver.find_element_by_xpath("/html/body/div[2]/div[2]/div[2]/div[1]/div[1]/em").text
plot = driver.find_element_by_xpath("/html/body/div[2]/div[2]/div[2]/div[4]").text
comment_list.append([title.replace("HD(1080) 19 ", "").replace("HD(1080) ", "").replace("19 ", ""), rate, plot])
driver.back()
time.sleep(1)

df = pd.DataFrame(comment_list, columns=['title', 'rate', 'plot']) # 오류 해결하고 줄거리도 추가할 예정입니다
df.to_csv('navercommenttt.csv', encoding='utf-8', index=False)
file1 = pd.read_csv('navercommenttt.csv')
print(file1)


def recommendation(keywords):
list_flag = [0 for i in range(len(keywords))]
flag_num = [0 for i in range(len(keywords))]
file1 = pd.read_csv("/Users/seolyumin/Hackathon-selenium/navercommenttt.csv", header=None, names=["title", "rate", "plot"])
title = list(file1['title'])
rate = list(file1['rate'])
plot = list(file1['plot'])
priority = [0 for i in range(len(title))]
priority_index = [0 for i in range(len(priority))]
for k in keywords:
for i in plot:
if k in str(i):
list_flag[keywords.index(k)] = 1
priority[plot.index(i)] += 1

flag_num = priority[:]

for i in range(len(priority)):
priority_index[i] = priority.index(max(priority))
priority[priority.index(max(priority))] = -1

for i in range(len(priority_index)):
if flag_num[priority_index[i]] > 0:
print(title[priority_index[i]], "# 키워드", flag_num[priority_index[i]], "개 포함 |", "당신과의 찰떡 지수:", round(flag_num[priority_index[i]] / len(keywords) * 100, 1), "%")

#movie_crawling()

derived_keywords = []
selected_keywords = []

print("키워드를 선택해주세요: ")
print("범죄 / 자연재해 / 괴물 / 오컬트") # 키워드 수정 바람
keywords1 = list(map(str, input().split()))
selected_keywords.extend(keywords1)

for word in keywords1:
derived_keywords.extend(keyword_crawling(word))

print("세부 키워드를 선택해주세요: ")
print("**********************************")
[print(word, end=' ') for word in derived_keywords]
print()
print("**********************************")
keywords2 = list(map(str, input().split()))
for word in keywords2:
word.replace("\x7f", "")
selected_keywords.extend(keywords2)

recommendation(selected_keywords)
44 changes: 44 additions & 0 deletions keyword_crawling.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from selenium import webdriver

keyword_list = []
user_keyword = input('원하시는 키워드를 입력하세요.')

while user_keyword != '':

driver = webdriver.Chrome('C:\\Users\\chromedriver')
url = "https://m.some.co.kr/analysis/keyword"
driver.get(url)

# 검색창에 키워드 입력
search_box = driver.find_element_by_xpath('/html/body/div[2]/div[1]/section/div/div/section[1]/div/input')
search_box.send_keys(user_keyword)

# 검색버튼 클릭
click_box = driver.find_element_by_xpath('/html/body/div[2]/div[1]/section/div/div/section[3]/a')
click_box.click()

# 웹페이지 요소 생성 대기시간
driver.implicitly_wait(10)

rel_search = []

table = driver.find_element_by_xpath('/html/body/div[2]/section[2]/div/div/section/div[2]/div[2]/div[3]/div/div/div[5]/div/div/div/div/table/tbody')

for tr in table.find_elements_by_tag_name('tr'):
td = tr.find_elements_by_tag_name('td')
rel_search.append(td[1].text + ' - ' + td[2].text)

print('입력하신 키워드의 연관 검색어입니다.')
for keyword in rel_search:
print(keyword)

driver.quit()

add_keyword = input('현재 검색어를 키워드로 등록하시겠습니까?(y/n)')

if add_keyword == 'y':
keyword_list.append(user_keyword)

user_keyword = input('계속 검색하시려면 키워드를 입력해주세요.') # 추가 키워드 입력 없을 시 종료

print(keyword_list)
46 changes: 46 additions & 0 deletions movie_crawling.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
from selenium import webdriver
import time


url_login = 'https://nid.naver.com/nidlogin.login'
id = '' # 네이버 아이디 입력
pw = '' # 네이버 비번 입력

driver = webdriver.Chrome('/Users/seolyumin/chromedriver') # 크롬드라이버 위치 입력
driver.get(url_login)
driver.implicitly_wait(2)

# execute_script 함수 사용하여 자바스크립트로 id,pw 넘겨주기
driver.execute_script("document.getElementsByName('id')[0].value=\'" + id + "\'")
driver.execute_script("document.getElementsByName('pw')[0].value=\'" + pw + "\'")

# 로그인 버튼 클릭하기
driver.find_element_by_xpath('//*[@id="frmNIDLogin"]/fieldset/input').click()
time.sleep(1)

# 브라우저 등록 / 등록 안 함
driver.find_element_by_xpath('/html/body/div[1]/div[2]/div/form/fieldset/span[2]/a').click()
time.sleep(1)

base_url = 'https://serieson.naver.com/movie/categoryList.nhn?categoryCode=100005&orderType=sale&sortingType=&mobileYn=&drmFreeYn=&freeYn=&discountYn=&tagCode=1&page={}'

comment_list = []
for page in range(1, 2):
url = base_url.format(page)
driver.get(url)
time.sleep(3)
for ul in range(1, 6):
for li in range(1, 6):
driver.find_element_by_xpath(
"/html/body/div[2]/div[2]/div[2]/div/ul[" + str(ul) + "]/li[" + str(li) + "]/a").click()
title = driver.find_element_by_xpath("/html/body/div[2]/div[2]/div[2]/div[1]/h2").text
rate = driver.find_element_by_xpath("/html/body/div[2]/div[2]/div[2]/div[1]/div[1]/em").text
plot = driver.find_element_by_xpath("/html/body/div[2]/div[2]/div[2]/div[4]").text
comment_list.append([title.replace("HD(1080) 19 ", "").replace("HD(1080) ", "").replace("19 ", ""), rate, plot])
driver.back()
time.sleep(1)

df = pd.DataFrame(comment_list, columns=['title', 'rate', 'plot'])
df.to_csv('navercommenttt.csv', encoding='utf-8', index=False)
file1 = pd.read_csv('navercommenttt.csv')
print(file1)
27 changes: 27 additions & 0 deletions pandas_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import pandas as pd

test_list = ['폐쇄', '산사태', '해일', '가족'] # 자연재해 -> 폐쇄, 산사태, 해일, 가족
list_flag = [0 for i in range(len(test_list))]
flag_num = [0 for i in range(len(test_list))]
file1 = pd.read_csv("/Users/seolyumin/Hackathon-selenium/csv_test2.csv", header=None, names=["title", "rate", "plot"])
title = list(file1['title'])
rate = list(file1['rate'])
plot = list(file1['plot'])
priority = [0 for i in range(len(title))]
priority_index = [0 for i in range(len(priority))]

for k in test_list:
for i in plot:
if k in i:
list_flag[test_list.index(k)] = 1
priority[plot.index(i)] += 1

flag_num = priority[:]

for i in range(len(priority)):
priority_index[i] = priority.index(max(priority))
priority[priority.index(max(priority))] = -1

for i in range(len(priority_index)):
if flag_num[priority_index[i]] > 0:
print(title[priority_index[i]], "# 키워드", flag_num[priority_index[i]], "개 포함 |", "당신과의 찰떡 지수:", flag_num[priority_index[i]] / len(test_list) * 100, "%")

0 comments on commit d33612c

Please sign in to comment.