10 changes: 6 additions & 4 deletions .github/workflows/deploy.yml
@@ -1,12 +1,14 @@
name: Deploy AI Server (FastAPI)

on:
-  push:
-    branches: [ "dev" ]
+  pull_request:
+    types: [ closed ]
+    branches: [ main ]

jobs:
  deploy-ai-server:
    runs-on: ubuntu-latest
+    if: github.event.pull_request.merged == true

    steps:
      - name: ✅ Checkout FastAPI code
@@ -28,5 +30,5 @@ jobs:
          key: ${{ secrets.EC2_KEY }}
          script: |
            cd ~/my-app
-            docker-compose pull ai-server
-            docker-compose up -d ai-server
+            docker compose pull ai-server
+            docker compose up -d ai-server
11 changes: 11 additions & 0 deletions app/main.py
@@ -0,0 +1,11 @@
from fastapi import FastAPI
from app.routes import skt_classify

app = FastAPI(title="SKT KoBERT Text Classification API with URL Extraction")

# Register the API router
app.include_router(skt_classify.router, prefix="/api", tags=["SKT KoBERT Classification"])

@app.get("/")
def root():
    return {"message": "FastAPI SKT KoBERT sentence classification API (with URL extraction)"}
15 changes: 15 additions & 0 deletions app/models/kobert_classifier.py
@@ -0,0 +1,15 @@
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer

# Load the KoBERT model and tokenizer
model_name = "monologg/kobert"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(model_name)

# Return the [CLS] token's hidden state as a sentence-level embedding
def get_sentence_embedding(text: str):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].squeeze().numpy()
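A minimal sketch of how the embedding can be used downstream, assuming the module above is importable as app.models.kobert_classifier:

import numpy as np
from app.models.kobert_classifier import get_sentence_embedding

# Compare two Korean sentences via cosine similarity of their [CLS] embeddings
a = get_sentence_embedding("도쿄 여행 일정 추천해줘")
b = get_sentence_embedding("유럽 배낭 여행 준비물")
sim = float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
print(f"cosine similarity: {sim:.3f}")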
19 changes: 19 additions & 0 deletions app/models/skt_kobert.py
@@ -0,0 +1,19 @@
import torch
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("skt/kobert-base-v1")
model = AutoModel.from_pretrained("skt/kobert-base-v1")

def get_sentence_embedding(text):
    inputs = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512
    )
    # The KoBERT tokenizer may emit token_type_ids the model does not accept; drop them
    inputs.pop("token_type_ids", None)

    with torch.no_grad():
        outputs = model(**inputs)
    # Use the [CLS] token's hidden state as the sentence embedding
    return outputs.last_hidden_state[:, 0, :].squeeze().numpy()
20 changes: 20 additions & 0 deletions app/models/url_classifier.py
@@ -0,0 +1,20 @@
import re

def separate_text_and_urls(text):
    # URL regex (matches the domain and an optional path)
    url_pattern = re.compile(r'https?://[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}(/\S*)?')

    # Extract URLs (using finditer)
    urls = [match.group() for match in url_pattern.finditer(text)]

    # Remove the URLs to get the plain text
    text_without_urls = url_pattern.sub('', text).strip()

    return text_without_urls, urls

# Quick test (only runs when the module is executed directly, not on import)
if __name__ == "__main__":
    sample_text = "이 영화는 너무 재미있어요! 예매는http://ticket.com 에서 할 수 있어요."
    text, urls = separate_text_and_urls(sample_text)

    print("📌 Plain text:", text)
    print("🔗 Extracted URLs:", urls)
16 changes: 16 additions & 0 deletions app/routes/classify.py
@@ -0,0 +1,16 @@
from fastapi import APIRouter
from pydantic import BaseModel
from typing import List
from app.services.text_processing import classify_or_create_category, category_embeddings

router = APIRouter()

# Request body schema
class SentenceRequest(BaseModel):
    sentences: List[str]  # list of sentences to classify

# Automatically classify a batch of input sentences
@router.post("/classify")
async def classify_text(sentences: SentenceRequest):
    categorized_results = classify_or_create_category(sentences.sentences)
    return {"categorized_sentences": categorized_results, "category_list": list(category_embeddings.keys())}
13 changes: 13 additions & 0 deletions app/routes/skt_classify.py
@@ -0,0 +1,13 @@
from fastapi import APIRouter
from pydantic import BaseModel
from app.services.skt_text_processing import classify_paragraph

router = APIRouter()

class ParagraphRequest(BaseModel):
    paragraph: str  # a single paragraph

@router.post("/classify_paragraph")
async def classify_paragraph_api(request: ParagraphRequest):
    result = classify_paragraph(request.paragraph)
    return result
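A hedged usage sketch for this endpoint. app/main.py mounts the router under the /api prefix, so the full path is /api/classify_paragraph; requests is an assumed extra dependency.

import requests

payload = {"paragraph": "도쿄 여행 일정 추천해줘\n예매는 http://ticket.com 에서 가능해요"}
resp = requests.post("http://localhost:8000/api/classify_paragraph", json=payload)
# Expected shape: {"category": ..., "recommend_category": ..., "sentences": [...]}
print(resp.json())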
5 changes: 5 additions & 0 deletions app/server.py
@@ -0,0 +1,5 @@
import uvicorn

if __name__ == "__main__":
    # reload=True enables auto-reload, which helps with debugging during development
    uvicorn.run("app.main:app", host="0.0.0.0", port=8000, reload=True)
84 changes: 84 additions & 0 deletions app/services/skt_text_processing.py
@@ -0,0 +1,84 @@
import re
from sklearn.metrics.pairwise import cosine_similarity
from typing import List, Dict
import numpy as np
from app.models.skt_kobert import get_sentence_embedding


# Seed categories with Korean example phrases (travel, transport, shopping, food, misc)
CATEGORY_LABELS = {
    "여행": ["도쿄 여행", "유럽 배낭 여행", "국내 캠핑", "비행기 예약", "숙소 추천"],
    "교통": ["지하철 환승", "고속열차 이용", "비행기 탑승", "대중교통 이용", "렌터카 예약"],
    "쇼핑": ["면세점 할인", "백화점 세일", "전자제품 구매", "패션 브랜드 쇼핑", "기념품 구매"],
    "음식": ["일본 라멘 맛집", "프랑스 빵 추천", "한식당 방문", "카페 탐방", "길거리 음식"],
    "기타": ["문화 체험", "미술관 방문", "박물관 견학", "자연 탐방", "테마파크 방문"]
}

url_pattern = re.compile(r'https?://[a-zA-Z0-9./?=&_%:-]+')

# ✅ Extract URLs from a sentence and strip them out of the text
def extract_urls_from_sentences(sentence: str):
    urls = url_pattern.findall(sentence)
    text_without_urls = url_pattern.sub(' ', sentence).strip()
    text_without_urls = re.sub(r'\s+', ' ', text_without_urls)
    return text_without_urls, urls

# The mean embedding of the example phrases serves as each category's vector
category_embeddings = {
    category: np.mean([get_sentence_embedding(example) for example in examples], axis=0)
    for category, examples in CATEGORY_LABELS.items()
}

# Fall back to the first multi-character word as the new category name
def generate_new_category(text: str):
    words = text.split()
    for word in words:
        if len(word) > 1:
            return word
    return "기타"

def classify_paragraph(paragraph: str, threshold: float = 0.7):
    global category_embeddings
    sentences = paragraph.split("\n")
    processed_sentences = []
    paragraph_embedding = get_sentence_embedding(paragraph)

    best_category = None
    best_similarity = 0

    for category, category_vector in category_embeddings.items():
        similarity = cosine_similarity([paragraph_embedding], [category_vector])[0][0]
        if similarity > best_similarity:
            best_similarity = similarity
            best_category = category

    # ✅ If no existing category is similar enough, return "category": "no" and recommend a new one
    if best_similarity < threshold:
        recommend_category = generate_new_category(paragraph)
        if recommend_category not in category_embeddings:
            category_embeddings[recommend_category] = paragraph_embedding
        return_category = "no"
    else:
        recommend_category = best_category
        return_category = best_category

    for sentence in sentences:
        text_part, urls = extract_urls_from_sentences(sentence)
        sentence_embedding = get_sentence_embedding(text_part) if text_part else None  # computed but not yet used per sentence

        if urls:
            sub_category = "관련 링크"  # "related links"
        else:
            sub_category = recommend_category

        processed_sentences.append({
            "text": text_part if text_part else "URL 포함 문장",  # "sentence containing a URL"
            "sub_category": sub_category,
            "urls": urls if urls else None
        })

    return {
        "category": return_category,
        "recommend_category": recommend_category,
        "sentences": processed_sentences
    }
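A minimal direct-invocation sketch (bypassing the HTTP layer), assuming the package imports resolve from the project root:

from app.services.skt_text_processing import classify_paragraph

# Newline-separated sentences; the line containing a URL is tagged "관련 링크"
result = classify_paragraph("도쿄 여행 일정 추천해줘\n예매는 http://ticket.com 에서 가능해요")
print(result["category"], result["recommend_category"])
for s in result["sentences"]:
    print(s["sub_category"], "-", s["text"], s["urls"])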
36 changes: 36 additions & 0 deletions app/services/text_processing.py
@@ -0,0 +1,36 @@
from sklearn.metrics.pairwise import cosine_similarity
from typing import List, Dict
import numpy as np
from app.models.kobert_classifier import get_sentence_embedding

# Existing category store (empty at startup)
category_embeddings: Dict[str, List[np.ndarray]] = {}

# Classify sentences: match each to a similar existing category or create a new one
def classify_or_create_category(sentences: List[str], threshold: float = 0.7):
    global category_embeddings
    categorized_sentences = {}

    for sentence in sentences:
        sentence_embedding = get_sentence_embedding(sentence)

        best_category = None
        best_similarity = 0

        # Compare similarity against existing categories
        for category, embeddings in category_embeddings.items():
            similarities = cosine_similarity([sentence_embedding], embeddings).mean()
            if similarities > best_similarity:
                best_similarity = similarities
                best_category = category

        # Add to an existing category if similar enough, otherwise create a new one
        if best_similarity >= threshold and best_category:
            categorized_sentences.setdefault(best_category, []).append(sentence)
            category_embeddings[best_category].append(sentence_embedding)
        else:
            new_category = f"Category_{len(category_embeddings) + 1}"
            category_embeddings[new_category] = [sentence_embedding]
            categorized_sentences.setdefault(new_category, []).append(sentence)

    return categorized_sentences
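A minimal sketch of the lazy category creation, assuming a fresh import (the store starts empty, so the first sentence always creates Category_1; later groupings depend on the 0.7 threshold):

from app.services.text_processing import classify_or_create_category, category_embeddings

result = classify_or_create_category([
    "도쿄 여행 일정 추천해줘",
    "유럽 배낭 여행 준비물",
    "일본 라멘 맛집 알려줘",
])
print(result)                            # e.g. {"Category_1": [...], ...}
print(list(category_embeddings.keys()))  # categories created so far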
4 changes: 4 additions & 0 deletions requirements.txt
@@ -1,4 +1,8 @@
fastapi
uvicorn
torch
transformers
+numpy
+sentencepiece
+protobuf
+scikit-learn
2 changes: 2 additions & 0 deletions test.txt
@@ -0,0 +1,2 @@
test
test11