diff --git a/apps/pre-processing-service/app/api/endpoints/product.py b/apps/pre-processing-service/app/api/endpoints/product.py
index 0e5f6e79..4e8c6682 100644
--- a/apps/pre-processing-service/app/api/endpoints/product.py
+++ b/apps/pre-processing-service/app/api/endpoints/product.py
@@ -18,7 +18,7 @@ async def search(request: RequestSadaguSearch):
     """
     Product search endpoint
     """
-    return search_products(request)
+    return await search_products(request)
 
 
 @router.post("/match", response_model=ResponseSadaguMatch)
 async def match(request: RequestSadaguMatch):
diff --git a/apps/pre-processing-service/app/core/config.py b/apps/pre-processing-service/app/core/config.py
index 536e3ddc..a35f048b 100644
--- a/apps/pre-processing-service/app/core/config.py
+++ b/apps/pre-processing-service/app/core/config.py
@@ -1,9 +1,65 @@
 # Import SettingsConfigDict along with BaseSettings from pydantic_settings.
 from pydantic_settings import BaseSettings, SettingsConfigDict
 import os
+import platform
+import subprocess
 from typing import Optional
+
+
+def detect_mecab_dicdir() -> Optional[str]:
+    """Auto-detect the MeCab dictionary directory."""
+
+    # 1. Ask mecab-config for the dictionary path (the most reliable method)
+    try:
+        result = subprocess.run(['mecab-config', '--dicdir'],
+                                capture_output=True, text=True, timeout=5)
+        if result.returncode == 0:
+            dicdir = result.stdout.strip()
+            if os.path.exists(dicdir):
+                print(f"mecab-config에서 사전 경로 발견: {dicdir}")
+                return dicdir
+    except (subprocess.CalledProcessError, FileNotFoundError, subprocess.TimeoutExpired):
+        pass
+
+    # 2. Check the usual per-platform locations
+    system = platform.system().lower()
+
+    if system == "darwin":  # macOS
+        candidate_paths = [
+            "/opt/homebrew/lib/mecab/dic/mecab-ko-dic",  # Apple Silicon
+            "/usr/local/lib/mecab/dic/mecab-ko-dic",     # Intel Mac
+            "/opt/homebrew/lib/mecab/dic/mecab-ipadic",  # default dictionary
+            "/usr/local/lib/mecab/dic/mecab-ipadic"
+        ]
+    elif system == "linux":
+        candidate_paths = [
+            "/usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ko-dic",
+            "/usr/lib/mecab/dic/mecab-ko-dic",
+            "/usr/local/lib/mecab/dic/mecab-ko-dic",
+            "/usr/share/mecab/dic/mecab-ko-dic",
+            "/usr/lib/mecab/dic/mecab-ipadic",
+            "/usr/local/lib/mecab/dic/mecab-ipadic"
+        ]
+    elif system == "windows":
+        candidate_paths = [
+            "C:/Program Files/MeCab/dic/mecab-ko-dic",
+            "C:/mecab/dic/mecab-ko-dic",
+            "C:/Program Files/MeCab/dic/mecab-ipadic"
+        ]
+    else:
+        candidate_paths = []
+
+    # Check whether any candidate path exists
+    for path in candidate_paths:
+        if os.path.exists(path):
+            # Check for a dicrc file (verifies it is an actual dictionary)
+            dicrc_path = os.path.join(path, "dicrc")
+            if os.path.exists(dicrc_path):
+                print(f"플랫폼 기본 경로에서 사전 발견: {path}")
+                return path
+
+    return None
+
 
 # Settings base class shared across environments
 class BaseSettingsConfig(BaseSettings):
@@ -13,7 +69,19 @@ class BaseSettingsConfig(BaseSettings):
     db_user: str
     db_pass: str
     db_name: str
-    env_name: str = "dev"
+    env_name: str = ".dev"
+
+    # MeCab dictionary path (auto-detected)
+    mecab_path: Optional[str] = None
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        # Auto-detect when mecab_path is not explicitly configured
+        if not self.mecab_path:
+            self.mecab_path = detect_mecab_dicdir()
+            if not self.mecab_path:
+                print("MeCab 사전 경로를 찾을 수 없어 기본 설정으로 실행합니다.")
 
     @property
     def db_url(self) -> str:
@@ -25,11 +93,11 @@ def db_url(self) -> str:
 
 # Environment-specific settings classes
 class DevSettings(BaseSettingsConfig):
-    model_config = SettingsConfigDict(env_file=['.env', 'dev.env'])
+    model_config = SettingsConfigDict(env_file=['.env', '.dev.env'])
 
 
 class PrdSettings(BaseSettingsConfig):
-    model_config = SettingsConfigDict(env_file=['.env', 'prd.env'])
+    model_config = SettingsConfigDict(env_file=['.env', '.prd.env'])
 
 
 def get_settings() -> BaseSettingsConfig:
     """Return the settings object appropriate for the current environment."""
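Note: the detection above feeds the MeCab.Tagger construction in app/utils/keyword_matcher.py further down. A minimal smoke test of the detected path — a sketch only, assuming the module-level `settings` object that keyword_matcher.py imports:

    from app.core.config import settings
    import MeCab

    # mecab_path was filled in by detect_mecab_dicdir() inside __init__; when it
    # is None, MeCab falls back to its compiled-in default dictionary.
    tagger = MeCab.Tagger(f"-d {settings.mecab_path}") if settings.mecab_path else MeCab.Tagger()
    print(tagger.parse("테스트"))  # one morpheme line per token, terminated by EOS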
diff --git a/apps/pre-processing-service/app/model/schemas.py b/apps/pre-processing-service/app/model/schemas.py
index b811a4e5..5e72fcfb 100644
--- a/apps/pre-processing-service/app/model/schemas.py
+++ b/apps/pre-processing-service/app/model/schemas.py
@@ -1,40 +1,31 @@
 from datetime import datetime
-from typing import Optional, List, Dict
+from typing import Optional, List, Dict, Union
 from pydantic import BaseModel, Field, HttpUrl
 
-#Base request
+# Base request
 class RequestBase(BaseModel):
     job_id: int
     schedule_id: int
-    sschdule_his_id: Optional[int] = None
+    schedule_his_id: Optional[int] = None
 
-#Base response
+# Base response
 class ResponseBase(BaseModel):
     job_id: int
     schedule_id: int
-    sschdule_his_id : Optional[int] = None
+    schedule_his_id: Optional[int] = None
     status: str
-
-#Naver keyword extraction
+# Naver keyword extraction
 class RequestNaverSearch(RequestBase):
     tag: str
     category: Optional[str] = None
-    start_date : Optional[str] = None
-    end_date : Optional[str] = None
+    start_date: Optional[str] = None
+    end_date: Optional[str] = None
 
 class ResponseNaverSearch(ResponseBase):
     category: Optional[str] = None
     keyword: str
-    total_keyword: dict[int, str]
-
-# #Keyword validation against Ssadagu mall
-# class RequestSadaguValidate(RequestBase):
-#     tag: str
-#     category: str
-#
-# class ResponsetSadaguValidate(ResponseBase):
-#     keyword: str
+    total_keyword: Dict[int, str]
 
 # Step 2: search
 class RequestSadaguSearch(RequestBase):
@@ -42,43 +33,51 @@ class ResponseSadaguSearch(ResponseBase):
     keyword: str
-    search_results: list[dict]
+    search_results: List[Dict]
 
 # Step 3: matching
 class RequestSadaguMatch(RequestBase):
     keyword: str
-    search_results: list[dict]
+    search_results: List[Dict]
 
 class ResponseSadaguMatch(ResponseBase):
     keyword: str
-    matched_products: list[dict]
+    matched_products: List[Dict]
 
 # Step 4: similarity
 class RequestSadaguSimilarity(RequestBase):
     keyword: str
-    matched_products: list[dict]
+    matched_products: List[Dict]
+    search_results: Optional[List[Dict]] = None  # fallback when step-3 matching fails
 
 class ResponseSadaguSimilarity(ResponseBase):
     keyword: str
-    selected_product: dict | None = None
-    reason: str | None = None
-
-#Ssadagu mall crawling
-class RequestSadaguCrawl(RequestBase):
+    selected_product: Optional[Dict] = None
+    reason: Optional[str] = None
+
+# Ssadagu mall crawling
+class RequestSadaguCrawl(BaseModel):
+    job_id: int = Field(..., description="작업 ID")
+    schedule_id: int = Field(..., description="스케줄 ID")
+    schedule_his_id: int = Field(..., description="스케줄 히스토리 ID")
     tag: str = Field(..., description="크롤링 태그 (예: 'detail')")
     product_url: HttpUrl = Field(..., description="크롤링할 상품의 URL")
     use_selenium: bool = Field(default=True, description="Selenium 사용 여부")
     include_images: bool = Field(default=False, description="이미지 정보 포함 여부")
 
-class ResponseSadaguCrawl(ResponseBase):
+class ResponseSadaguCrawl(BaseModel):
+    job_id: int
+    schedule_id: int
+    schedule_his_id: int
     tag: str
     product_url: str
     use_selenium: bool
     include_images: bool
-    product_detail: Optional[dict] = None
+    product_detail: Optional[Dict] = None
+    status: str
     crawled_at: Optional[str] = None
 
-#Blog creation
+# Blog creation
 class RequestBlogCreate(RequestBase):
     tag: str
     category: str
@@ -86,10 +85,10 @@ class RequestBlogCreate(RequestBase):
 class ResponseBlogCreate(ResponseBase):
     pass
 
-#Blog publishing
+# Blog publishing
 class RequestBlogPublish(RequestBase):
     tag: str
     category: str
 
 class ResponseBlogPublish(ResponseBase):
-    pass
+    pass
\ No newline at end of file
diff --git a/apps/pre-processing-service/app/service/crawl_service.py b/apps/pre-processing-service/app/service/crawl_service.py
index c2a4e13a..11844ead 100644
--- a/apps/pre-processing-service/app/service/crawl_service.py
+++ b/apps/pre-processing-service/app/service/crawl_service.py
@@ -1,270 +1,49 @@
-import json
+# app/service/crawl_service.py
 import time
-import re
-import httpx
-from bs4 import BeautifulSoup
-from selenium import webdriver
-from selenium.webdriver.chrome.options import Options
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.common.exceptions import TimeoutException, NoSuchElementException
-
+from app.utils.crawler_utils import DetailCrawler
 from app.errors.CustomException import InvalidItemDataException
 from app.model.schemas import RequestSadaguCrawl
 
+
 async def crawl_product_detail(request: RequestSadaguCrawl) -> dict:
     """
-    Business logic that crawls the selected product's detail data.
+    Business logic that crawls the selected product's detail data. (Step 5)
    Takes a product URL and returns the crawled detail data as a dict.
     """
-    crawler = ProductDetailCrawler(use_selenium=request.use_selenium)
+    crawler = DetailCrawler(use_selenium=request.use_selenium)
 
     try:
+        print(f"상품 상세 크롤링 시작: {request.product_url}")
+
         # Run the detail crawl
         product_detail = await crawler.crawl_detail(
-            product_url=str(request.product_url),  # convert HttpUrl to str
+            product_url=str(request.product_url),
             include_images=request.include_images
         )
 
         if not product_detail:
             raise InvalidItemDataException("상품 상세 정보 크롤링 실패")
 
+        print(f"크롤링 완료: {product_detail.get('title', 'Unknown')[:50]}")
+
         # Build the response payload
         response_data = {
             "job_id": request.job_id,
             "schedule_id": request.schedule_id,
+            "schedule_his_id": request.schedule_his_id,
             "tag": request.tag,
             "product_url": str(request.product_url),
             "use_selenium": request.use_selenium,
             "include_images": request.include_images,
             "product_detail": product_detail,
-            "status": "success",  # changed from "200" to "success"
+            "status": "success",
             "crawled_at": time.strftime('%Y-%m-%d %H:%M:%S')
         }
 
         return response_data
 
     except Exception as e:
+        print(f"크롤링 서비스 오류: {e}")
         raise InvalidItemDataException(f"상품 상세 크롤링 오류: {e}")
 
     finally:
-        await crawler.close()
-
-
-class ProductDetailCrawler:
-    def __init__(self, use_selenium=True):
-        self.base_url = "https://ssadagu.kr"
-        self.use_selenium = use_selenium
-
-        if use_selenium:
-            self._setup_selenium()
-        else:
-            self._setup_httpx()
-
-    def _setup_selenium(self):
-        """Initialize the Selenium WebDriver."""
-        chrome_options = Options()
-        chrome_options.add_argument('--headless')
-        chrome_options.add_argument('--no-sandbox')
-        chrome_options.add_argument('--disable-dev-shm-usage')
-        chrome_options.add_argument('--disable-gpu')
-        chrome_options.add_argument('--window-size=1920,1080')
-        chrome_options.add_argument(
-            '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
-
-        try:
-            self.driver = webdriver.Chrome(options=chrome_options)
-            self.wait = WebDriverWait(self.driver, 10)
-        except Exception as e:
-            print(f"Selenium 초기화 실패, httpx로 대체: {e}")
-            self.use_selenium = False
-            self._setup_httpx()
-
-    def _setup_httpx(self):
-        """Initialize the httpx client."""
-        self.client = httpx.AsyncClient(
-            headers={
-                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
-            },
-            timeout=30.0
-        )
-
-    async def crawl_detail(self, product_url: str, include_images: bool = False) -> dict:
-        """Crawl product detail data."""
-        try:
-            if self.use_selenium:
-                soup = await self._get_soup_selenium(product_url)
-            else:
-                soup = await self._get_soup_httpx(product_url)
-
-            # Extract the basic fields
-            title = self._extract_title(soup)
-            price = self._extract_price(soup)
-            rating = self._extract_rating(soup)
-            options = self._extract_options(soup)
-            material_info = self._extract_material_info(soup)
-
-            product_data = {
-                'url': product_url,
-                'title': title,
-                'price': price,
-                'rating': rating,
-                'options': options,
-                'material_info': material_info,
-                'crawled_at': time.strftime('%Y-%m-%d %H:%M:%S')
-            }
-
-            # Add image info (optional)
-            if include_images:
-                product_images = self._extract_images(soup)
-                product_data['product_images'] = [{'original_url': img_url} for img_url in product_images]
-            else:
-                product_data['product_images'] = []
-
-            return product_data
-
-        except Exception as e:
-            print(f"크롤링 오류: {e}")
-            raise InvalidItemDataException(f"크롤링 실패: {str(e)}")
-
-    async def _get_soup_selenium(self, product_url: str) -> BeautifulSoup:
-        """Fetch HTML via Selenium."""
-        try:
-            self.driver.get(product_url)
-            self.wait.until(lambda driver: driver.execute_script("return document.readyState") == "complete")
-            return BeautifulSoup(self.driver.page_source, 'html.parser')
-        except Exception as e:
-            raise Exception(f"Selenium HTML 로딩 실패: {e}")
-
-    async def _get_soup_httpx(self, product_url: str) -> BeautifulSoup:
-        """Fetch HTML via httpx."""
-        try:
-            response = await self.client.get(product_url)
-            response.raise_for_status()
-            return BeautifulSoup(response.content, 'html.parser')
-        except Exception as e:
-            raise Exception(f"HTTP 요청 실패: {e}")
-
-    def _extract_title(self, soup: BeautifulSoup) -> str:
-        """Extract the title."""
-        title_element = soup.find('h1', {'id': 'kakaotitle'})
-        return title_element.get_text(strip=True) if title_element else "제목 없음"
-
-    def _extract_price(self, soup: BeautifulSoup) -> int:
-        """Extract the price."""
-        price_selectors = [
-            'span.price.gsItemPriceKWR',
-            '.pdt_price span.price',
-            'span.price',
-            '.price'
-        ]
-
-        for selector in price_selectors:
-            price_element = soup.select_one(selector)
-            if price_element:
-                price_text = price_element.get_text(strip=True).replace(',', '').replace('원', '')
-                price_match = re.search(r'(\d+)', price_text)
-                if price_match:
-                    return int(price_match.group(1))
-        return 0
-
-    def _extract_rating(self, soup: BeautifulSoup) -> float:
-        """Extract the star rating."""
-        rating = 0.0
-        star_containers = [
-            soup.find('a', class_='start'),
-            soup.find('div', class_=re.compile(r'star|rating')),
-            soup.find('a', href='#reviews_wrap')
-        ]
-
-        for container in star_containers:
-            if container:
-                star_imgs = container.find_all('img')
-                for img in star_imgs:
-                    src = img.get('src', '')
-                    if 'icon_star.svg' in src:
-                        rating += 1
-                    elif 'icon_star_half.svg' in src:
-                        rating += 0.5
-                break
-        return rating
-
-    def _extract_options(self, soup: BeautifulSoup) -> list[dict]:
-        """Extract product options."""
-        options = []
-        sku_list = soup.find('ul', {'id': 'skubox'})
-
-        if sku_list:
-            option_items = sku_list.find_all('li', class_=re.compile(r'imgWrapper'))
-            for item in option_items:
-                title_element = item.find('a', title=True)
-                if title_element:
-                    option_name = title_element.get('title', '').strip()
-
-                    # Extract stock info
-                    stock = 0
-                    item_text = item.get_text()
-                    stock_match = re.search(r'재고\s*:\s*(\d+)', item_text)
-                    if stock_match:
-                        stock = int(stock_match.group(1))
-
-                    # Extract the image URL
-                    img_element = item.find('img', class_='colorSpec_hashPic')
-                    image_url = ""
-                    if img_element and img_element.get('src'):
-                        image_url = img_element['src']
-
-                    if option_name:
-                        options.append({
-                            'name': option_name,
-                            'stock': stock,
-                            'image_url': image_url
-                        })
-        return options
-
-    def _extract_material_info(self, soup: BeautifulSoup) -> dict:
-        """Extract material/ingredient info."""
-        material_info = {}
-        info_items = soup.find_all('div', class_='pro-info-item')
-
-        for item in info_items:
-            title_element = item.find('div', class_='pro-info-title')
-            info_element = item.find('div', class_='pro-info-info')
-
-            if title_element and info_element:
-                title = title_element.get_text(strip=True)
-                info = info_element.get_text(strip=True)
-                material_info[title] = info
-
-        return material_info
-
-    def _extract_images(self, soup: BeautifulSoup) -> list[str]:
-        """Extract product image URLs."""
-        images = []
-        img_elements = soup.find_all('img', {'id': re.compile(r'img_translate_\d+')})
-
-        for img in img_elements:
-            src = img.get('src', '')
-            if src:
-                if src.startswith('//'):
-                    src = 'https:' + src
-                elif src.startswith('/'):
-                    src = self.base_url + src
-                elif src.startswith('http'):
-                    pass
-                else:
-                    continue
-                images.append(src)
-
-        return images
-
-    async def close(self):
-        """Clean up resources."""
-        if self.use_selenium and hasattr(self, 'driver'):
-            try:
-                self.driver.quit()
-            except Exception:
-                pass
-        elif hasattr(self, 'client'):
-            try:
-                await self.client.aclose()
-            except Exception:
-                pass
\ No newline at end of file
+        await crawler.close()
\ No newline at end of file
diff --git a/apps/pre-processing-service/app/service/match_service.py b/apps/pre-processing-service/app/service/match_service.py
index 5e0926c3..6b1cc171 100644
--- a/apps/pre-processing-service/app/service/match_service.py
+++ b/apps/pre-processing-service/app/service/match_service.py
@@ -1,21 +1,66 @@
-import urllib
-
+from app.utils.keyword_matcher import KeywordMatcher
+from app.errors.CustomException import InvalidItemDataException
 from ..model.schemas import RequestSadaguMatch
 
+
 def match_products(request: RequestSadaguMatch) -> dict:
     """
-    Keyword matching logic (using MeCab, etc.)
+    Keyword matching logic (using MeCab, etc.) - Step 3
     """
     keyword = request.keyword
    products = request.search_results
 
-    # TODO: real keyword matching logic still needs to be applied
-    matched = [p for p in products if keyword in p["title"]]
+    if not products:
+        return {
+            "job_id": request.job_id,
+            "schedule_id": request.schedule_id,
+            "schedule_his_id": request.schedule_his_id,
+            "keyword": keyword,
+            "matched_products": [],
+            "status": "success"
+        }
+
+    try:
+        matcher = KeywordMatcher()
+        matched_products = []
+
+        print(f"키워드 '{keyword}'와 {len(products)}개 상품 매칭 분석 시작...")
+
+        for i, product in enumerate(products):
+            title = product.get('title', '')
+            if not title:
+                continue
+
+            # Analyze how well the title matches the keyword
+            match_result = matcher.analyze_keyword_match(title, keyword)
+
+            print(f"상품 {i + 1}: {title[:50]} | {match_result['reason']}")
+
+            if match_result['is_match']:
+                # Attach match info to the matched product
+                matched_product = product.copy()
+                matched_product['match_info'] = {
+                    'match_type': match_result['match_type'],
+                    'match_score': match_result['score'],
+                    'match_reason': match_result['reason']
+                }
+                matched_products.append(matched_product)
+                print(f"  ✅ 매칭됨!")
+
+        print(f"매칭 결과: {len(matched_products)}개 상품")
+
+        # Sort by match score (descending)
+        matched_products.sort(key=lambda x: x['match_info']['match_score'], reverse=True)
+
+        return {
+            "job_id": request.job_id,
+            "schedule_id": request.schedule_id,
+            "schedule_his_id": request.schedule_his_id,
+            "keyword": keyword,
+            "matched_products": matched_products,
+            "status": "success"
+        }
 
-    return {
-        "job_id": request.job_id,
-        "schedule_id": request.schedule_id,
-        "keyword": keyword,
-        "matched_products": matched,
-        "status": "success"
-    }
\ No newline at end of file
+    except Exception as e:
+        print(f"매칭 서비스 오류: {e}")
+        raise InvalidItemDataException(f"키워드 매칭 실패: {str(e)}")
\ No newline at end of file
diff --git a/apps/pre-processing-service/app/service/search_service.py b/apps/pre-processing-service/app/service/search_service.py
index 45acb34a..da7aa1fd 100644
--- a/apps/pre-processing-service/app/service/search_service.py
+++ b/apps/pre-processing-service/app/service/search_service.py
@@ -1,23 +1,81 @@
+from app.utils.crawler_utils import SearchCrawler
+from app.errors.CustomException import InvalidItemDataException
 from ..model.schemas import RequestSadaguSearch
-import urllib.parse
 
-def search_products(request: RequestSadaguSearch) -> dict:
+
+async def search_products(request: RequestSadaguSearch) -> dict:
     """
-    Business logic that searches for products by keyword
+    Business logic that searches for products by keyword (Step 2)
     """
     keyword = request.keyword
-    encoded_keyword = urllib.parse.quote(keyword)
-
-    # TODO: Selenium/requests logic still needs to be added
-    search_results = [
-        {"url": f"https://ssadagu.kr/view.php?id=123"},
-        {"url": f"https://ssadagu.kr/view.php?id=456"}
-    ]
-
-    return {
-        "job_id": request.job_id,
-        "schedule_id": request.schedule_id,
-        "keyword": keyword,
-        "search_results": search_results,
-        "status": "success"
-    }
+    crawler = SearchCrawler(use_selenium=True)
+
+    try:
+        print(f"키워드 '{keyword}'로 상품 검색 시작...")
+
+        # Search products via Selenium or httpx
+        if crawler.use_selenium:
+            search_results = await crawler.search_products_selenium(keyword)
+        else:
+            search_results = await crawler.search_products_httpx(keyword)
+
+        if not search_results:
+            print("검색 결과가 없습니다.")
+            return {
+                "job_id": request.job_id,
+                "schedule_id": request.schedule_id,
+                "schedule_his_id": request.schedule_his_id,
+                "keyword": keyword,
+                "search_results": [],
+                "status": "success"
+            }
+
+        # Collect basic info per product (re-crawl when a title is missing)
+        enriched_results = []
+        print(f"총 {len(search_results)}개 상품의 기본 정보를 수집 중...")
+
+        for i, product in enumerate(search_results):
+            try:
+                # Use the product as-is when it already has a valid title
+                if product.get('title') and product['title'] != 'Unknown Title' and len(product['title'].strip()) > 0:
+                    enriched_results.append(product)
+                else:
+                    # Otherwise re-crawl to recover the title
+                    print(f"상품 {i + 1}: 제목 재수집 중... ({product['url']})")
+                    basic_info = await crawler.get_basic_product_info(product['url'])
+
+                    if basic_info and basic_info['title'] != "제목 없음":
+                        enriched_results.append({
+                            'url': product['url'],
+                            'title': basic_info['title']
+                        })
+                    else:
+                        # Exclude the product if the title still cannot be recovered
+                        print(f"  제목 추출 실패, 제외")
+                        continue
+
+                # Process at most 20 products
+                if len(enriched_results) >= 20:
+                    break
+
+            except Exception as e:
+                print(f"상품 {i + 1} 처리 중 오류: {e}")
+                continue
+
+        print(f"최종 수집된 유효 상품: {len(enriched_results)}개")
+
+        return {
+            "job_id": request.job_id,
+            "schedule_id": request.schedule_id,
+            "schedule_his_id": request.schedule_his_id,
+            "keyword": keyword,
+            "search_results": enriched_results,
+            "status": "success"
+        }
+
+    except Exception as e:
+        print(f"검색 서비스 오류: {e}")
+        raise InvalidItemDataException(f"상품 검색 실패: {str(e)}")
+
+    finally:
+        await crawler.close()
\ No newline at end of file
diff --git a/apps/pre-processing-service/app/service/similarity_service.py b/apps/pre-processing-service/app/service/similarity_service.py
index 29aa3b20..27823e9e 100644
--- a/apps/pre-processing-service/app/service/similarity_service.py
+++ b/apps/pre-processing-service/app/service/similarity_service.py
@@ -1,20 +1,137 @@
+from app.utils.similarity_analyzer import SimilarityAnalyzer
+from app.errors.CustomException import InvalidItemDataException
 from ..model.schemas import RequestSadaguSimilarity
 
+
 def select_product_by_similarity(request: RequestSadaguSimilarity) -> dict:
     """
-    Select a product after BERT-based similarity analysis
+    Select a product after BERT-based similarity analysis - Step 4
     """
     keyword = request.keyword
     candidates = request.matched_products
+    fallback_products = request.search_results or []
+
+    # Fall back to the full search results when nothing was matched
+    if not candidates:
+        if not fallback_products:
+            return {
+                "job_id": request.job_id,
+                "schedule_id": request.schedule_id,
+                "schedule_his_id": request.schedule_his_id,
+                "keyword": keyword,
+                "selected_product": None,
+                "reason": "매칭된 상품과 검색 결과가 모두 없음",
+                "status": "success"
+            }
+
+        print("매칭된 상품 없음 → 전체 검색 결과에서 유사도 분석")
+        candidates = fallback_products
+        analysis_mode = "fallback_similarity_only"
+    else:
+        analysis_mode = "matched_products"
+
+    try:
+        analyzer = SimilarityAnalyzer()
+
+        print(f"키워드 '{keyword}'와 {len(candidates)}개 상품의 유사도 분석 시작... (모드: {analysis_mode})")
+
+        # With a single candidate, select it directly
+        if len(candidates) == 1:
+            selected_product = candidates[0]
+
+            # Compute its similarity
+            similarity = analyzer.calculate_similarity(keyword, selected_product['title'])
+
+            # In fallback mode, enforce the similarity threshold
+            if analysis_mode == "fallback_similarity_only":
+                similarity_threshold = 0.3
+                if similarity < similarity_threshold:
+                    return {
+                        "job_id": request.job_id,
+                        "schedule_id": request.schedule_id,
+                        "schedule_his_id": request.schedule_his_id,
+                        "keyword": keyword,
+                        "selected_product": None,
+                        "reason": f"단일 상품 유사도({similarity:.4f}) < 기준({similarity_threshold})",
+                        "status": "success"
+                    }
+
+            selected_product['similarity_info'] = {
+                'similarity_score': float(similarity),
+                'analysis_type': 'single_candidate',
+                'analysis_mode': analysis_mode
+            }
+
+            return {
+                "job_id": request.job_id,
+                "schedule_id": request.schedule_id,
+                "schedule_his_id": request.schedule_his_id,
+                "keyword": keyword,
+                "selected_product": selected_product,
+                "reason": f"단일 상품 - 유사도: {similarity:.4f} ({analysis_mode})",
+                "status": "success"
+            }
+
+        # With multiple candidates, compare similarities
+        print("여러 상품 중 최고 유사도로 선택...")
+
+        # Batch-analyze just the titles
+        titles = [product['title'] for product in candidates]
+        similarity_results = analyzer.analyze_similarity_batch(keyword, titles)
+
+        # Log the results
+        for result in similarity_results:
+            print(f"  {result['title'][:40]} | 유사도: {result['similarity']:.4f}")
+
+        # Take the highest similarity
+        best_result = similarity_results[0]
+        selected_product = candidates[best_result['index']].copy()
+
+        # In fallback mode, enforce the similarity threshold
+        similarity_threshold = 0.3
+        if analysis_mode == "fallback_similarity_only" and best_result['similarity'] < similarity_threshold:
+            return {
+                "job_id": request.job_id,
+                "schedule_id": request.schedule_id,
+                "schedule_his_id": request.schedule_his_id,
+                "keyword": keyword,
+                "selected_product": None,
+                "reason": f"최고 유사도({best_result['similarity']:.4f}) < 기준({similarity_threshold})",
+                "status": "success"
+            }
+
+        # Attach similarity info
+        selected_product['similarity_info'] = {
+            'similarity_score': best_result['similarity'],
+            'analysis_type': 'multi_candidate_bert',
+            'analysis_mode': analysis_mode,
+            'rank': 1,
+            'total_candidates': len(candidates)
+        }
+
+        # In matched mode, also compute a composite score
+        if analysis_mode == "matched_products" and 'match_info' in selected_product:
+            match_score = selected_product['match_info']['match_score']
+            similarity_score = best_result['similarity']
+            # Weights: matching 40%, similarity 60%
+            final_score = match_score * 0.4 + similarity_score * 0.6
+            selected_product['final_score'] = final_score
+            reason = f"종합점수({final_score:.4f}) = 매칭({match_score:.4f})*0.4 + 유사도({similarity_score:.4f})*0.6"
+        else:
+            reason = f"유사도({best_result['similarity']:.4f}) 기준 선택 ({analysis_mode})"
+
+        print(f"선택됨: {selected_product['title'][:50]} | {reason}")
+
+        return {
+            "job_id": request.job_id,
+            "schedule_id": request.schedule_id,
+            "schedule_his_id": request.schedule_his_id,
+            "keyword": keyword,
+            "selected_product": selected_product,
+            "reason": reason,
+            "status": "success"
+        }
 
-    # TODO: real similarity analysis logic still needs to be applied
-    selected = candidates[0] if candidates else None
-
-    return {
-        "job_id": request.job_id,
-        "schedule_id": request.schedule_id,
-        "keyword": keyword,
-        "selected_product": selected,
-        "reason": "샘플 로직: 첫 번째 매칭 선택",
-        "status": "success"
-    }
+    except Exception as e:
+        print(f"유사도 분석 서비스 오류: {e}")
+        raise InvalidItemDataException(f"유사도 분석 실패: {str(e)}")
\ No newline at end of file
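Note: a worked example of the composite score computed above in matched-products mode, with illustrative numbers:

    match_score = 0.8        # e.g. a morphological match from step 3
    similarity_score = 0.91  # BERT cosine similarity from step 4
    # Weights: matching 40%, similarity 60%
    final_score = match_score * 0.4 + similarity_score * 0.6  # 0.32 + 0.546 = 0.866

In fallback mode no match_info exists, so the raw similarity (gated by the 0.3 threshold) decides alone.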
diff --git a/apps/pre-processing-service/app/test/test_keyword.py b/apps/pre-processing-service/app/test/test_keyword.py
index 572cfd41..e0432139 100644
--- a/apps/pre-processing-service/app/test/test_keyword.py
+++ b/apps/pre-processing-service/app/test/test_keyword.py
@@ -1,6 +1,6 @@
 import pytest
 from fastapi.testclient import TestClient
-from ..main import app  # import the FastAPI app object from main.py
+from app.main import app
 
 client = TestClient(app)
 
@@ -9,39 +9,36 @@
 SCHEDULE_HIS_ID = 1
 
-
 def test_read_root():
-    # Send the request to the API using the client.
     response = client.get("/keyword/")
-    # Verify that the HTTP status code is 200 OK.
     assert response.status_code == 200
-    # Verify that the response body (JSON) matches expectations.
     assert response.json() == {"message": "keyword API"}
 
 @pytest.mark.parametrize("tag, category, start_date, end_date", [
-    ("naver","50000000","2025-09-01","2025-09-02"),
-    ("naver","50000001","2025-09-01","2025-09-02"),
-    ("naver","50000002","2025-09-01","2025-09-02"),
-    # ("naver","50000002","2025-08-08","2025-08-09"),
-    ("naver_store","","2025-09-01","2025-09-02"),
+    ("naver", "50000000", "2025-09-01", "2025-09-02"),
+    ("naver", "50000001", "2025-09-01", "2025-09-02"),
+    ("naver", "50000002", "2025-09-01", "2025-09-02"),
+    ("naver_store", "", "2025-09-01", "2025-09-02"),
 ])
-def test_search(tag,category, start_date, end_date):
-
+def test_search(tag, category, start_date, end_date):
     body = {
-        "job_id":JOB_ID,
+        "job_id": JOB_ID,
         "schedule_id": SCHEDULE_ID,
-        "sschdule_his_id":SCHEDULE_HIS_ID,
-        "tag":tag,
-        "category":category,
-        "start_date":start_date,
-        "end_date":end_date
+        "schedule_his_id": SCHEDULE_HIS_ID,  # typo fixed
+        "tag": tag,
+        "category": category,
+        "start_date": start_date,
+        "end_date": end_date
     }
-    response = client.post("/keyword/search",json=body)
-    assert response.json()["job_id"] == body["job_id"]
-    assert response.json()["schedule_id"] == body["schedule_id"]
-    assert response.json()["sschdule_his_id"] == body["sschdule_his_id"]
-    assert response.json()["status"] == "success"
-    assert "keyword" in response.json()
-    assert isinstance(response.json()["total_keyword"], dict)
-    assert response.status_code == 200
\ No newline at end of file
+
+    response = client.post("/keyword/search", json=body)
+    assert response.status_code == 200
+
+    response_data = response.json()
+    assert response_data["job_id"] == body["job_id"]
+    assert response_data["schedule_id"] == body["schedule_id"]
+    assert response_data["schedule_his_id"] == body["schedule_his_id"]  # typo fixed
+    assert response_data["status"] == "success"
+    assert "keyword" in response_data
+    assert isinstance(response_data["total_keyword"], dict)
\ No newline at end of file
diff --git a/apps/pre-processing-service/app/test/test_match_service.py b/apps/pre-processing-service/app/test/test_match_service.py
new file mode 100644
index 00000000..7b80c258
--- /dev/null
+++ b/apps/pre-processing-service/app/test/test_match_service.py
@@ -0,0 +1,97 @@
+import pytest
+from fastapi.testclient import TestClient
+from app.main import app
+
+client = TestClient(app)
+
+
+def test_match_success():
+    """Keyword matching success test."""
+    sample_search_results = [
+        {
+            "url": "https://ssadagu.kr/shop/view.php?platform=1688&num_iid=123",
+            "title": "925 실버 반지 여성용 결혼반지"
+        },
+        {
+            "url": "https://ssadagu.kr/shop/view.php?platform=1688&num_iid=456",
+            "title": "골드 목걸이 체인 펜던트"
+        },
+        {
+            "url": "https://ssadagu.kr/shop/view.php?platform=1688&num_iid=789",
+            "title": "반지 세트 커플링 약혼반지"
+        }
+    ]
+
+    body = {
+        "job_id": 1,
+        "schedule_id": 1,
+        "schedule_his_id": 1,
+        "keyword": "반지",
+        "search_results": sample_search_results
+    }
+
+    response = client.post("/product/match", json=body)
+    print(f"Match Response: {response.json()}")
+
+    assert response.status_code == 200
+    data = response.json()
+    assert data["job_id"] == body["job_id"]
+    assert data["keyword"] == body["keyword"]
+    assert data["status"] == "success"
+    assert isinstance(data["matched_products"], list)
+
+    # Products whose titles contain '반지' should be matched
+    if data["matched_products"]:
+        for product in data["matched_products"]:
+            assert "match_info" in product
+            assert "match_type" in product["match_info"]
+            assert "match_score" in product["match_info"]
+
+
+def test_match_no_results():
+    """Case with no search results."""
+    body = {
+        "job_id": 2,
+        "schedule_id": 2,
+        "schedule_his_id": 2,
+        "keyword": "반지",
+        "search_results": []
+    }
+
+    response = client.post("/product/match", json=body)
+    print(f"No results response: {response.json()}")
+
+    assert response.status_code == 200
+    data = response.json()
+    assert data["matched_products"] == []
+
+
+def test_match_no_matches():
+    """Products that do not match the keyword."""
+    sample_search_results = [
+        {
+            "url": "https://ssadagu.kr/shop/view.php?platform=1688&num_iid=123",
+            "title": "컴퓨터 키보드 게이밍"
+        },
+        {
+            "url": "https://ssadagu.kr/shop/view.php?platform=1688&num_iid=456",
+            "title": "스마트폰 케이스 투명"
+        }
+    ]
+
+    body = {
+        "job_id": 3,
+        "schedule_id": 3,
+        "schedule_his_id": 3,
+        "keyword": "반지",
+        "search_results": sample_search_results
+    }
+
+    response = client.post("/product/match", json=body)
+    print(f"No matches response: {response.json()}")
+
+    assert response.status_code == 200
+    data = response.json()
+    # No matches is still treated as success
+    assert data["status"] == "success"
+    assert isinstance(data["matched_products"], list)
\ No newline at end of file
diff --git a/apps/pre-processing-service/app/test/test_sadagu_crawl.py b/apps/pre-processing-service/app/test/test_sadagu_crawl.py
index 3f336cdf..d034be43 100644
--- a/apps/pre-processing-service/app/test/test_sadagu_crawl.py
+++ b/apps/pre-processing-service/app/test/test_sadagu_crawl.py
@@ -1,16 +1,14 @@
-# app/test/test_sadagu_crawl.py
 import pytest
 from fastapi.testclient import TestClient
 from app.main import app
-from app.errors.CustomException import InvalidItemDataException, ItemNotFoundException
 
 client = TestClient(app)
 
 
 def test_crawl_success():
     body = {
-        "job_id": "test-job-001",
-        "schedule_id": "schedule-001",
+        "job_id": 1,  # fixed: string -> int
+        "schedule_id": 1,  # fixed: string -> int
         "schedule_his_id": 1,
         "tag": "detail",
         "product_url": "https://ssadagu.kr/shop/view.php?platform=1688&num_iid=886788894790",
@@ -19,7 +17,8 @@ def test_crawl_success():
     }
 
     response = client.post("/product/crawl", json=body)
-    print(response.json())
+    print(f"Response: {response.json()}")
+
     assert response.status_code == 200
     data = response.json()
     assert data["job_id"] == body["job_id"]
@@ -29,10 +28,10 @@ def test_crawl_success():
 
 
 def test_crawl_invalid_url():
-    """Invalid URL but the page exists - succeeds with empty data"""
+    """Invalid URL but the page exists"""
     body = {
-        "job_id": "test-job-002",
-        "schedule_id": "schedule-002",
+        "job_id": 2,
+        "schedule_id": 2,
         "schedule_his_id": 2,
         "tag": "detail",
         "product_url": "https://ssadagu.kr/shop/view.php?platform=1688&num_iid=invalid",
@@ -41,25 +40,22 @@ def test_crawl_invalid_url():
     }
 
     response = client.post("/product/crawl", json=body)
-    print(response.json())
+    print(f"Response: {response.json()}")
 
-    # Tests the case that returns 200 but carries no valid data
     assert response.status_code == 200
     data = response.json()
 
-    # Check for empty data or default values
     product_detail = data.get("product_detail", {})
-    assert product_detail.get("title") in ["제목 없음", "제목 추출 실패"]
-    assert product_detail.get("price") == 0
-    assert len(product_detail.get("options", [])) == 0
+    assert product_detail.get("title") in ["제목 없음", "제목 추출 실패", None]
+    assert product_detail.get("price", 0) == 0
 
 
 def test_crawl_completely_invalid_url():
-    """Completely nonexistent domain - a real error occurs"""
+    """Completely nonexistent domain"""
     body = {
-        "job_id": "test-job-002-invalid",
-        "schedule_id": "schedule-002-invalid",
-        "schedule_his_id": 2,
+        "job_id": 3,
+        "schedule_id": 3,
+        "schedule_his_id": 3,
         "tag": "detail",
         "product_url": "https://nonexistent-domain-12345.com/invalid",
         "use_selenium": False,
@@ -67,17 +63,16 @@ def test_crawl_completely_invalid_url():
     }
 
     response = client.post("/product/crawl", json=body)
-    print(response.json())
+    print(f"Response: {response.json()}")
 
-    # In this case a real error must occur
     assert response.status_code in (400, 422, 500)
 
 
 def test_crawl_include_images():
     body = {
-        "job_id": "test-job-003",
-        "schedule_id": "schedule-003",
-        "schedule_his_id": 3,
+        "job_id": 4,
+        "schedule_id": 4,
+        "schedule_his_id": 4,
         "tag": "detail",
         "product_url": "https://ssadagu.kr/shop/view.php?platform=1688&num_iid=886788894790",
         "use_selenium": False,
@@ -85,7 +80,8 @@ def test_crawl_include_images():
     }
 
     response = client.post("/product/crawl", json=body)
-    print(response.json())
+    print(f"Response: {response.json()}")
+
     assert response.status_code == 200
     data = response.json()
     assert data["include_images"] is True
diff --git a/apps/pre-processing-service/app/test/test_search_service.py b/apps/pre-processing-service/app/test/test_search_service.py
new file mode 100644
index 00000000..6dd415e0
--- /dev/null
+++ b/apps/pre-processing-service/app/test/test_search_service.py
@@ -0,0 +1,62 @@
+import pytest
+from fastapi.testclient import TestClient
+from app.main import app
+
+client = TestClient(app)
+
+
+def test_search_success():
+    """Product search success test."""
+    body = {
+        "job_id": 1,
+        "schedule_id": 1,
+        "schedule_his_id": 1,
+        "keyword": "반지"
+    }
+
+    response = client.post("/product/search", json=body)
+    print(f"Search Response: {response.json()}")
+
+    assert response.status_code == 200
+    data = response.json()
+    assert data["job_id"] == body["job_id"]
+    assert data["keyword"] == body["keyword"]
+    assert data["status"] == "success"
+    assert isinstance(data["search_results"], list)
+
+
+def test_search_empty_keyword():
+    """Empty-keyword search test."""
+    body = {
+        "job_id": 2,
+        "schedule_id": 2,
+        "schedule_his_id": 2,
+        "keyword": ""
+    }
+
+    response = client.post("/product/search", json=body)
+    print(f"Empty keyword response: {response.json()}")
+
+    # Even an empty keyword should return empty results, not an error
+    assert response.status_code == 200
+    data = response.json()
+    assert data["search_results"] == []
+
+
+def test_search_nonexistent_keyword():
+    """Search with a keyword that should not exist."""
+    body = {
+        "job_id": 3,
+        "schedule_id": 3,
+        "schedule_his_id": 3,
+        "keyword": "zxcvbnmasdfghjklqwertyuiop123456789"
+    }
+
+    response = client.post("/product/search", json=body)
+    print(f"Nonexistent keyword response: {response.json()}")
+
+    assert response.status_code == 200
+    data = response.json()
+    # No search results is still treated as success
+    assert data["status"] == "success"
+    assert isinstance(data["search_results"], list)
\ No newline at end of file
diff --git a/apps/pre-processing-service/app/test/test_similarity_service.py b/apps/pre-processing-service/app/test/test_similarity_service.py
new file mode 100644
index 00000000..1888b873
--- /dev/null
+++ b/apps/pre-processing-service/app/test/test_similarity_service.py
@@ -0,0 +1,136 @@
+import pytest
+from fastapi.testclient import TestClient
+from app.main import app
+
+client = TestClient(app)
+
+
+def test_similarity_with_matched_products():
+    """Similarity analysis over the matched products."""
+    matched_products = [
+        {
+            "url": "https://ssadagu.kr/shop/view.php?platform=1688&num_iid=123",
+            "title": "925 실버 반지 여성용",
+            "match_info": {
+                "match_type": "exact",
+                "match_score": 1.0,
+                "match_reason": "완전 매칭"
+            }
+        },
+        {
+            "url": "https://ssadagu.kr/shop/view.php?platform=1688&num_iid=456",
+            "title": "반지 세트 커플링",
+            "match_info": {
+                "match_type": "morphological",
+                "match_score": 0.8,
+                "match_reason": "형태소 매칭"
+            }
+        }
+    ]
+
+    body = {
+        "job_id": 1,
+        "schedule_id": 1,
+        "schedule_his_id": 1,
+        "keyword": "반지",
+        "matched_products": matched_products
+    }
+
+    response = client.post("/product/similarity", json=body)
+    print(f"Similarity Response: {response.json()}")
+
+    assert response.status_code == 200
+    data = response.json()
+    assert data["job_id"] == body["job_id"]
+    assert data["keyword"] == body["keyword"]
+    assert data["status"] == "success"
+
+    if data["selected_product"]:
+        assert "similarity_info" in data["selected_product"]
+        assert "similarity_score" in data["selected_product"]["similarity_info"]
+        assert data["reason"] is not None
+
+
+def test_similarity_fallback_to_search_results():
+    """Similarity analysis over the full search results when matching fails."""
+    search_results = [
+        {
+            "url": "https://ssadagu.kr/shop/view.php?platform=1688&num_iid=123",
+            "title": "실버 링 악세서리"
+        },
+        {
+            "url": "https://ssadagu.kr/shop/view.php?platform=1688&num_iid=456",
+            "title": "골드 반지 여성"
+        }
+    ]
+
+    body = {
+        "job_id": 2,
+        "schedule_id": 2,
+        "schedule_his_id": 2,
+        "keyword": "반지",
+        "matched_products": [],  # no matched products
+        "search_results": search_results  # fallback data
+    }
+
+    response = client.post("/product/similarity", json=body)
+    print(f"Fallback Response: {response.json()}")
+
+    assert response.status_code == 200
+    data = response.json()
+    assert data["status"] == "success"
+
+    # In fallback mode a product is selected only when it passes the threshold
+    if data["selected_product"]:
+        assert "similarity_info" in data["selected_product"]
+        assert data["selected_product"]["similarity_info"]["analysis_mode"] == "fallback_similarity_only"
+
+
+def test_similarity_single_candidate():
+    """Case with a single candidate."""
+    single_product = [
+        {
+            "url": "https://ssadagu.kr/shop/view.php?platform=1688&num_iid=123",
+            "title": "925 실버 반지 여성용",
+            "match_info": {
+                "match_type": "exact",
+                "match_score": 1.0
+            }
+        }
+    ]
+
+    body = {
+        "job_id": 3,
+        "schedule_id": 3,
+        "schedule_his_id": 3,
+        "keyword": "반지",
+        "matched_products": single_product
+    }
+
+    response = client.post("/product/similarity", json=body)
+    print(f"Single candidate response: {response.json()}")
+
+    assert response.status_code == 200
+    data = response.json()
+    assert data["selected_product"] is not None
+    assert data["selected_product"]["similarity_info"]["analysis_type"] == "single_candidate"
+
+
+def test_similarity_no_candidates():
+    """Case with no candidates at all."""
+    body = {
+        "job_id": 4,
+        "schedule_id": 4,
+        "schedule_his_id": 4,
+        "keyword": "반지",
+        "matched_products": [],
+        "search_results": []
+    }
+
+    response = client.post("/product/similarity", json=body)
+    print(f"No candidates response: {response.json()}")
+
+    assert response.status_code == 200
+    data = response.json()
+    assert data["selected_product"] is None
+    assert "검색 결과가 모두 없음" in data["reason"]
\ No newline at end of file
diff --git a/apps/pre-processing-service/app/utils/__init__.py b/apps/pre-processing-service/app/utils/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/apps/pre-processing-service/app/utils/crawler_utils.py b/apps/pre-processing-service/app/utils/crawler_utils.py
new file mode 100644
index 00000000..8246788a
--- /dev/null
+++ b/apps/pre-processing-service/app/utils/crawler_utils.py
@@ -0,0 +1,340 @@
+import urllib.parse
+import httpx
+import re
+import time
+from bs4 import BeautifulSoup
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.common.exceptions import TimeoutException, NoSuchElementException
+
+
+class SearchCrawler:
+    def __init__(self, use_selenium=True):
+        self.base_url = "https://ssadagu.kr"
+        self.use_selenium = use_selenium
+
+        if use_selenium:
+            self._setup_selenium()
+        else:
+            self._setup_httpx()
+
+    def _setup_selenium(self):
+        """Initialize the Selenium WebDriver."""
+        chrome_options = Options()
+        chrome_options.add_argument('--headless')
+        chrome_options.add_argument('--no-sandbox')
+        chrome_options.add_argument('--disable-dev-shm-usage')
+        chrome_options.add_argument('--disable-gpu')
+        chrome_options.add_argument('--window-size=1920,1080')
+        chrome_options.add_argument(
+            '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
+        )
+
+        try:
+            self.driver = webdriver.Chrome(options=chrome_options)
+            self.wait = WebDriverWait(self.driver, 10)
+            print("Selenium WebDriver 초기화 완료")
+        except Exception as e:
+            print(f"Selenium 초기화 실패, httpx로 대체: {e}")
+            self.use_selenium = False
+            self._setup_httpx()
+
+    def _setup_httpx(self):
+        """Initialize the httpx client."""
+        self.client = httpx.AsyncClient(
+            headers={
+                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
+            },
+            timeout=30.0
+        )
+
+    async def search_products_selenium(self, keyword: str) -> list[dict]:
+        """Search products using Selenium."""
+        encoded_keyword = urllib.parse.quote(keyword)
+        search_url = f"{self.base_url}/shop/search.php?ss_tx={encoded_keyword}"
+
+        try:
+            self.driver.get(search_url)
+            time.sleep(5)
+
+            product_links = []
+            link_elements = self.driver.find_elements(By.TAG_NAME, "a")
+
+            for element in link_elements:
+                href = element.get_attribute('href')
+                if href and 'view.php' in href and ('platform=1688' in href or 'num_iid' in href):
+                    try:
+                        title = element.get_attribute('title') or element.text.strip()
+                        if title:
+                            product_links.append({
+                                'url': href,
+                                'title': title
+                            })
+                    except Exception:
+                        product_links.append({
+                            'url': href,
+                            'title': 'Unknown Title'
+                        })
+
+            # Remove duplicates
+            seen_urls = set()
+            unique_products = []
+            for product in product_links:
+                if product['url'] not in seen_urls:
+                    seen_urls.add(product['url'])
+                    unique_products.append(product)
+
+            print(f"Selenium으로 발견한 상품 링크: {len(unique_products)}개")
+            return unique_products[:20]
+
+        except Exception as e:
+            print(f"Selenium 검색 오류: {e}")
+            return []
+
+    async def search_products_httpx(self, keyword: str) -> list[dict]:
+        """Search products using httpx."""
+        encoded_keyword = urllib.parse.quote(keyword)
+        search_url = f"{self.base_url}/shop/search.php?ss_tx={encoded_keyword}"
+
+        try:
+            response = await self.client.get(search_url)
+            response.raise_for_status()
+            soup = BeautifulSoup(response.content, 'html.parser')
+
+            product_links = []
+            all_links = soup.find_all('a', href=True)
+
+            for link in all_links:
+                href = link['href']
+                if 'view.php' in href and ('platform=1688' in href or 'num_iid' in href):
+                    full_url = f"{self.base_url}{href}" if href.startswith('/') else href
+                    title = link.get('title', '') or link.get_text(strip=True) or 'Unknown Title'
+
+                    product_links.append({
+                        'url': full_url,
+                        'title': title
+                    })
+
+            print(f"httpx로 발견한 상품 링크: {len(product_links)}개")
+            return product_links[:20]
+
+        except Exception as e:
+            print(f"httpx 검색 오류: {e}")
+            return []
+
+    async def get_basic_product_info(self, product_url: str) -> dict:
+        """Crawl only the basic product info."""
+        try:
+            if self.use_selenium:
+                self.driver.get(product_url)
+                self.wait.until(lambda driver: driver.execute_script("return document.readyState") == "complete")
+                soup = BeautifulSoup(self.driver.page_source, 'html.parser')
+            else:
+                response = await self.client.get(product_url)
+                response.raise_for_status()
+                soup = BeautifulSoup(response.content, 'html.parser')
+
+            title_element = soup.find('h1', {'id': 'kakaotitle'})
+            title = title_element.get_text(strip=True) if title_element else "제목 없음"
+
+            return {
+                'url': product_url,
+                'title': title
+            }
+
+        except Exception as e:
+            print(f"기본 상품 크롤링 오류 ({product_url}): {e}")
+            return None
+
+    async def close(self):
+        """Clean up resources."""
+        if self.use_selenium and hasattr(self, 'driver'):
+            try:
+                self.driver.quit()
+            except Exception:
+                pass
+        elif hasattr(self, 'client'):
+            try:
+                await self.client.aclose()
+            except Exception:
+                pass
+
+
+class DetailCrawler(SearchCrawler):
+    """Detail-crawling class extending SearchCrawler."""
+
+    async def crawl_detail(self, product_url: str, include_images: bool = False) -> dict:
+        """Crawl product detail data."""
+        try:
+            if self.use_selenium:
+                soup = await self._get_soup_selenium(product_url)
+            else:
+                soup = await self._get_soup_httpx(product_url)
+
+            # Extract the basic fields
+            title = self._extract_title(soup)
+            price = self._extract_price(soup)
+            rating = self._extract_rating(soup)
+            options = self._extract_options(soup)
+            material_info = self._extract_material_info(soup)
+
+            product_data = {
+                'url': product_url,
+                'title': title,
+                'price': price,
+                'rating': rating,
+                'options': options,
+                'material_info': material_info,
+                'crawled_at': time.strftime('%Y-%m-%d %H:%M:%S')
+            }
+
+            if include_images:
+                print("이미지 정보 추출 중...")
+                product_images = self._extract_images(soup)
+                product_data['product_images'] = [{'original_url': img_url} for img_url in product_images]
+                print(f"추출된 이미지: {len(product_images)}개")
+            else:
+                product_data['product_images'] = []
+
+            return product_data
+
+        except Exception as e:
+            print(f"크롤링 오류: {e}")
+            raise Exception(f"크롤링 실패: {str(e)}")
+
+    async def _get_soup_selenium(self, product_url: str) -> BeautifulSoup:
+        """Fetch HTML via Selenium."""
+        try:
+            self.driver.get(product_url)
+            self.wait.until(lambda driver: driver.execute_script("return document.readyState") == "complete")
+            time.sleep(2)
+            return BeautifulSoup(self.driver.page_source, 'html.parser')
+        except Exception as e:
+            raise Exception(f"Selenium HTML 로딩 실패: {e}")
+
+    async def _get_soup_httpx(self, product_url: str) -> BeautifulSoup:
+        """Fetch HTML via httpx."""
+        try:
+            response = await self.client.get(product_url)
+            response.raise_for_status()
+            return BeautifulSoup(response.content, 'html.parser')
+        except Exception as e:
+            raise Exception(f"HTTP 요청 실패: {e}")
+
+    def _extract_title(self, soup: BeautifulSoup) -> str:
+        """Extract the title."""
+        title_element = soup.find('h1', {'id': 'kakaotitle'})
+        return title_element.get_text(strip=True) if title_element else "제목 없음"
+
+    def _extract_price(self, soup: BeautifulSoup) -> int:
+        """Extract the price."""
+        price = 0
+        price_selectors = [
+            'span.price.gsItemPriceKWR',
+            '.pdt_price span.price',
+            'span.price',
+            '.price'
+        ]
+
+        for selector in price_selectors:
+            price_element = soup.select_one(selector)
+            if price_element:
+                price_text = price_element.get_text(strip=True).replace(',', '').replace('원', '')
+                price_match = re.search(r'(\d+)', price_text)
+                if price_match:
+                    price = int(price_match.group(1))
+                    break
+        return price
+
+    def _extract_rating(self, soup: BeautifulSoup) -> float:
+        """Extract the rating."""
+        rating = 0.0
+        star_containers = [
+            soup.find('a', class_='start'),
+            soup.find('div', class_=re.compile(r'star|rating')),
+            soup.find('a', href='#reviews_wrap')
+        ]
+
+        for container in star_containers:
+            if container:
+                star_imgs = container.find_all('img')
+                for img in star_imgs:
+                    src = img.get('src', '')
+                    if 'icon_star.svg' in src:
+                        rating += 1
+                    elif 'icon_star_half.svg' in src:
+                        rating += 0.5
+                break
+        return rating
+
+    def _extract_options(self, soup: BeautifulSoup) -> list[dict]:
+        """Extract product options."""
+        options = []
+        sku_list = soup.find('ul', {'id': 'skubox'})
+
+        if sku_list:
+            option_items = sku_list.find_all('li', class_=re.compile(r'imgWrapper'))
+            for item in option_items:
+                title_element = item.find('a', title=True)
+                if title_element:
+                    option_name = title_element.get('title', '').strip()
+
+                    # Extract stock info
+                    stock = 0
+                    item_text = item.get_text()
+                    stock_match = re.search(r'재고\s*:\s*(\d+)', item_text)
+                    if stock_match:
+                        stock = int(stock_match.group(1))
+
+                    # Extract the image URL
+                    img_element = item.find('img', class_='colorSpec_hashPic')
+                    image_url = ""
+                    if img_element and img_element.get('src'):
+                        image_url = img_element['src']
+
+                    if option_name:
+                        options.append({
+                            'name': option_name,
+                            'stock': stock,
+                            'image_url': image_url
+                        })
+
+        return options
+
+    def _extract_material_info(self, soup: BeautifulSoup) -> dict:
+        """Extract material info."""
+        material_info = {}
+        info_items = soup.find_all('div', class_='pro-info-item')
+
+        for item in info_items:
+            title_element = item.find('div', class_='pro-info-title')
+            info_element = item.find('div', class_='pro-info-info')
+
+            if title_element and info_element:
+                title = title_element.get_text(strip=True)
+                info = info_element.get_text(strip=True)
+                material_info[title] = info
+
+        return material_info
+
+    def _extract_images(self, soup: BeautifulSoup) -> list[str]:
+        """Extract product image URLs."""
+        images = []
+        img_elements = soup.find_all('img', {'id': re.compile(r'img_translate_\d+')})
+
+        for img in img_elements:
+            src = img.get('src', '')
+            if src:
+                if src.startswith('//'):
+                    src = 'https:' + src
+                elif src.startswith('/'):
+                    src = self.base_url + src
+                elif src.startswith('http'):
+                    pass
+                else:
+                    continue
+                images.append(src)
+
+        return images
\ No newline at end of file
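Note: a minimal async sketch of how the two crawlers compose, assuming only the classes above; the keyword is illustrative:

    import asyncio
    from app.utils.crawler_utils import SearchCrawler, DetailCrawler

    async def demo():
        # use_selenium=False takes the httpx path, so no browser is required
        search = SearchCrawler(use_selenium=False)
        try:
            results = await search.search_products_httpx("반지")
        finally:
            await search.close()

        if results:
            detail = DetailCrawler(use_selenium=False)
            try:
                product = await detail.crawl_detail(results[0]['url'], include_images=False)
                print(product['title'], product['price'])
            finally:
                await detail.close()

    asyncio.run(demo())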
diff --git a/apps/pre-processing-service/app/utils/keyword_matcher.py b/apps/pre-processing-service/app/utils/keyword_matcher.py
new file mode 100644
index 00000000..8fab2730
--- /dev/null
+++ b/apps/pre-processing-service/app/utils/keyword_matcher.py
@@ -0,0 +1,148 @@
+from app.core.config import settings  # pydantic_settings-based config
+
+try:
+    import MeCab
+
+    print("MeCab 라이브러리 로딩 성공")
+    MECAB_AVAILABLE = True
+except ImportError:
+    print("MeCab 라이브러리를 찾을 수 없습니다. pip install mecab-python3 를 실행해주세요.")
+    MeCab = None
+    MECAB_AVAILABLE = False
+
+
+class KeywordMatcher:
+    """Keyword match analyzer."""
+
+    def __init__(self):
+        self.konlpy_available = False
+
+        # Check whether MeCab is usable
+        if MECAB_AVAILABLE:
+            try:
+                # Use the configured dictionary path if present, else the default
+                if settings.mecab_path:
+                    self.mecab = MeCab.Tagger(f"-d {settings.mecab_path}")
+                else:
+                    self.mecab = MeCab.Tagger()  # default path
+
+                # Smoke test
+                test_result = self.mecab.parse("테스트")
+                if test_result and test_result.strip():
+                    self.konlpy_available = True
+                    print(f"MeCab 형태소 분석기 사용 가능 (경로: {settings.mecab_path or '기본'})")
+                else:
+                    print("MeCab 테스트 실패")
+            except Exception as e:
+                print(f"MeCab 사용 불가 (규칙 기반으로 대체): {e}")
+        else:
+            print("MeCab 라이브러리가 설치되지 않았습니다. 규칙 기반으로 대체합니다.")
+
+    def analyze_keyword_match(self, title: str, keyword: str) -> dict:
+        """Return the keyword match analysis result."""
+        title_lower = title.lower().strip()
+        keyword_lower = keyword.lower().strip()
+
+        # 1. Exact-substring check
+        exact_match = keyword_lower in title_lower
+        if exact_match:
+            return {
+                'is_match': True,
+                'match_type': 'exact',
+                'score': 1.0,
+                'reason': f"완전 포함: '{keyword}' in '{title[:50]}'"
+            }
+
+        # 2. Morphological analysis (via MeCab)
+        if self.konlpy_available:
+            morphological_result = self._morphological_match(title_lower, keyword_lower)
+            if morphological_result['is_match']:
+                return morphological_result
+
+        # 3. Rule-based analysis (when MeCab is unavailable or fails)
+        simple_result = self._simple_keyword_match(title_lower, keyword_lower)
+        return simple_result
+
+    def _morphological_match(self, title: str, keyword: str) -> dict:
+        """Morphology-based matching."""
+        try:
+            # Analyze the keyword morphemes
+            keyword_result = self.mecab.parse(keyword)
+            keyword_morphs = []
+            for line in keyword_result.split('\n'):
+                if line == 'EOS' or line == '':
+                    continue
+                parts = line.split('\t')
+                if len(parts) >= 1:
+                    morph = parts[0].strip()
+                    if len(morph) >= 1:
+                        keyword_morphs.append(morph)
+
+            # Analyze the title morphemes
+            title_result = self.mecab.parse(title)
+            title_morphs = []
+            for line in title_result.split('\n'):
+                if line == 'EOS' or line == '':
+                    continue
+                parts = line.split('\t')
+                if len(parts) >= 1:
+                    morph = parts[0].strip()
+                    if len(morph) >= 1:
+                        title_morphs.append(morph)
+
+            # Match morphemes
+            matched = 0
+            for kw in keyword_morphs:
+                if len(kw) >= 2:  # check only meaningful morphemes
+                    for tw in title_morphs:
+                        if kw == tw or kw in tw or tw in kw:
+                            matched += 1
+                            break
+
+            match_ratio = matched / len(keyword_morphs) if keyword_morphs else 0
+            threshold = 0.4
+
+            if match_ratio >= threshold:
+                return {
+                    'is_match': True,
+                    'match_type': 'morphological',
+                    'score': match_ratio,
+                    'reason': f"형태소 매칭: {matched}/{len(keyword_morphs)} = {match_ratio:.3f}"
+                }
+
+        except Exception as e:
+            print(f"형태소 분석 오류: {e}")
+
+        return {'is_match': False, 'match_type': 'morphological', 'score': 0.0, 'reason': '형태소 분석 실패'}
+
+    def _simple_keyword_match(self, title: str, keyword: str) -> dict:
+        """Simple rule-based keyword matching."""
+        # Split on whitespace
+        title_words = title.split()
+        keyword_words = keyword.split()
+
+        matched = 0
+        for kw in keyword_words:
+            if len(kw) >= 2:
+                for tw in title_words:
+                    if kw in tw or tw in kw:
+                        matched += 1
+                        break
+
+        match_ratio = matched / len(keyword_words) if keyword_words else 0
+        threshold = 0.3
+
+        if match_ratio >= threshold:
+            return {
+                'is_match': True,
+                'match_type': 'simple',
+                'score': match_ratio,
+                'reason': f"규칙 기반 매칭: {matched}/{len(keyword_words)} = {match_ratio:.3f}"
+            }
+
+        return {
+            'is_match': False,
+            'match_type': 'simple',
+            'score': match_ratio,
+            'reason': f"규칙 기반 미달: {matched}/{len(keyword_words)} = {match_ratio:.3f} < {threshold}"
+        }
\ No newline at end of file
diff --git a/apps/pre-processing-service/app/utils/similarity_analyzer.py b/apps/pre-processing-service/app/utils/similarity_analyzer.py
new file mode 100644
index 00000000..d155ee2e
--- /dev/null
+++ b/apps/pre-processing-service/app/utils/similarity_analyzer.py
@@ -0,0 +1,65 @@
+import torch
+import numpy as np
+from sklearn.metrics.pairwise import cosine_similarity
+from transformers import AutoTokenizer, AutoModel
+
+
+class SimilarityAnalyzer:
+    """Text similarity analyzer."""
+
+    def __init__(self):
+        try:
+            self.tokenizer = AutoTokenizer.from_pretrained('klue/bert-base')
+            self.model = AutoModel.from_pretrained('klue/bert-base')
+            print("KLUE BERT 모델 로딩 성공")
+        except Exception as e:
+            print(f"KLUE BERT 로딩 실패, 다국어 BERT로 대체: {e}")
+            try:
+                self.tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
+                self.model = AutoModel.from_pretrained('bert-base-multilingual-cased')
+                print("다국어 BERT 모델 로딩 성공")
+            except Exception as e2:
+                print(f"모든 BERT 모델 로딩 실패: {e2}")
+                raise e2
+
+    def get_embedding(self, text: str) -> np.ndarray:
+        """Create a text embedding (the [CLS] token's last hidden state)."""
+        inputs = self.tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128)
+        with torch.no_grad():
+            outputs = self.model(**inputs)
+        return outputs.last_hidden_state[:, 0, :].numpy()
+
+    def calculate_similarity(self, text1: str, text2: str) -> float:
+        """Compute the cosine similarity between two texts."""
+        embedding1 = self.get_embedding(text1)
+        embedding2 = self.get_embedding(text2)
+        return cosine_similarity(embedding1, embedding2)[0][0]
+
+    def analyze_similarity_batch(self, keyword: str, product_titles: list[str]) -> list[dict]:
+        """Analyze similarities for a batch of titles."""
+        keyword_embedding = self.get_embedding(keyword)
+        results = []
+
+        for i, title in enumerate(product_titles):
+            try:
+                title_embedding = self.get_embedding(title)
+                similarity = cosine_similarity(keyword_embedding, title_embedding)[0][0]
+
+                results.append({
+                    'index': i,
+                    'title': title,
+                    'similarity': float(similarity),
+                    'score': float(similarity)
+                })
+            except Exception as e:
+                print(f"유사도 계산 오류 (제목: {title[:30]}): {e}")
+                results.append({
+                    'index': i,
+                    'title': title,
+                    'similarity': 0.0,
+                    'score': 0.0
+                })
+
+        # Sort by similarity, descending
+        results.sort(key=lambda x: x['similarity'], reverse=True)
+        return results
\ No newline at end of file
diff --git a/apps/pre-processing-service/pyproject.toml b/apps/pre-processing-service/pyproject.toml
index 35b2e563..af7d2124 100644
--- a/apps/pre-processing-service/pyproject.toml
+++ b/apps/pre-processing-service/pyproject.toml
@@ -26,12 +26,9 @@ dependencies = [
     "scikit-learn (>=1.7.1,<2.0.0)",
     "python-dotenv (>=1.1.1,<2.0.0)",
    "mecab-python3 (>=1.0.10,<2.0.0)",
-    "python-mecab-ko (>=1.3.7,<2.0.0)",
-    "python-mecab-ko-dic (>=2.1.1.post2,<3.0.0)",
     "httpx (>=0.28.1,<0.29.0)",
     "asyncpg (>=0.30.0,<0.31.0)",
     "gunicorn (>=23.0.0,<24.0.0)",
-    "httpx (>=0.28.1,<0.29.0)"
 ]
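Note: a quick interactive check of the analyzer above (the first call downloads klue/bert-base from the Hugging Face hub):

    from app.utils.similarity_analyzer import SimilarityAnalyzer

    analyzer = SimilarityAnalyzer()
    print(analyzer.calculate_similarity("반지", "925 실버 반지 여성용"))   # relatively high
    print(analyzer.calculate_similarity("반지", "컴퓨터 키보드 게이밍"))  # relatively low
    ranked = analyzer.analyze_similarity_batch("반지", ["골드 반지 여성", "실버 링 악세서리"])
    print(ranked[0]['title'])  # best match first (results are sorted descending)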