Commit 51aba53

feat: add dev files to the deployment #1 (#4)

* fix: update requirements.txt
* feat: CI/CD setup and prototype scaffolding (#2)
* feat: add prototype #1
* feat: add prototype and adjust environment #1
* feat: modify deploy for testing #1
* feat: test #1
* feat: test22 #1
* feat: change deploy settings #1
* fix: change deploy settings #1
* style: remove comments #1
* style: remove comments #1
* style: remove additional comments #1
1 parent 1d4623a commit 51aba53

File tree

12 files changed: +231 −4 lines changed

.github/workflows/deploy.yml

Lines changed: 6 additions & 4 deletions
@@ -1,12 +1,14 @@
 name: Deploy AI Server (FastAPI)

 on:
-  push:
-    branches: [ "dev" ]
+  pull_request:
+    types: [ closed ]
+    branches: [ main ]

 jobs:
   deploy-ai-server:
     runs-on: ubuntu-latest
+    if: github.event.pull_request.merged == true

     steps:
       - name: ✅ Checkout FastAPI code
@@ -28,5 +30,5 @@ jobs:
           key: ${{ secrets.EC2_KEY }}
           script: |
             cd ~/my-app
-            docker-compose pull ai-server
-            docker-compose up -d ai-server
+            docker compose pull ai-server
+            docker compose up -d ai-server

app/main.py

Lines changed: 11 additions & 0 deletions
from fastapi import FastAPI
from app.routes import skt_classify

app = FastAPI(title="SKT KoBERT Text Classification API with URL Extraction")

# Register the API router
app.include_router(skt_classify.router, prefix="/api", tags=["SKT KoBERT Classification"])

@app.get("/")
def root():
    return {"message": "FastAPI SKT KoBERT sentence classification API (with URL extraction)"}
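A quick smoke test of the new entry point (a minimal sketch, assuming the app package is importable from the project root and httpx is installed for FastAPI's TestClient):

# Sketch: hit the root route in-process, without starting uvicorn
from fastapi.testclient import TestClient

from app.main import app

client = TestClient(app)
response = client.get("/")
print(response.status_code)  # 200
print(response.json())       # the root message defined above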

app/models/kobert_classifier.py

Lines changed: 15 additions & 0 deletions
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer

# Load the KoBERT model
model_name = "monologg/kobert"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(model_name)

# Build a sentence embedding from the final hidden state of the [CLS] token
def get_sentence_embedding(text: str):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].squeeze().numpy()
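A rough sanity check for this helper (a sketch; assumes the monologg/kobert weights download and that the model is BERT-base-sized, i.e. a 768-dimensional hidden state):

# Sketch: embed one sentence and inspect the vector
from app.models.kobert_classifier import get_sentence_embedding

emb = get_sentence_embedding("도쿄 여행 계획을 세우고 있어요.")
print(emb.shape)  # expected (768,) for a BERT-base-sized model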

app/models/skt_kobert.py

Lines changed: 19 additions & 0 deletions
import torch
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("skt/kobert-base-v1")
model = AutoModel.from_pretrained("skt/kobert-base-v1")

def get_sentence_embedding(text):
    inputs = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512
    )
    # Drop token_type_ids produced by the tokenizer; this model does not need them
    inputs.pop("token_type_ids", None)

    with torch.no_grad():
        outputs = model(**inputs)
    # Return the [CLS] position of the last hidden state as a NumPy vector
    return outputs.last_hidden_state[:, 0, :].squeeze().numpy()
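As a usage sketch (assuming the skt/kobert-base-v1 checkpoint loads in this environment), the embeddings can be compared directly with cosine similarity, which is how the service modules below use them:

# Sketch: similarity between two sentence embeddings
from sklearn.metrics.pairwise import cosine_similarity
from app.models.skt_kobert import get_sentence_embedding

a = get_sentence_embedding("지하철 환승이 편리해요.")
b = get_sentence_embedding("대중교통 이용이 쉬워요.")
print(cosine_similarity([a], [b])[0][0])  # closer to 1.0 means more similar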

app/models/url_classifier.py

Lines changed: 20 additions & 0 deletions
import re

def separate_text_and_urls(text):
    # URL regex (captures scheme, domain, and an optional path)
    url_pattern = re.compile(r'https?://[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}(/\S*)?')

    # Extract the URLs (via finditer)
    urls = [match.group() for match in url_pattern.finditer(text)]

    # Plain text with the URLs stripped out
    text_without_urls = url_pattern.sub('', text).strip()

    return text_without_urls, urls

# Test
sample_text = "이 영화는 너무 재미있어요! 예매는http://ticket.com 에서 할 수 있어요."
text, urls = separate_text_and_urls(sample_text)

print("📌 Plain text:", text)
print("🔗 Extracted URLs:", urls)

app/routes/classify.py

Lines changed: 16 additions & 0 deletions
from fastapi import APIRouter
from pydantic import BaseModel
from typing import List
from app.services.text_processing import classify_or_create_category, category_embeddings

router = APIRouter()

# Request body schema
class SentenceRequest(BaseModel):
    sentences: List[str]  # a list of sentences

# Automatically classify a list of input sentences
@router.post("/classify")
async def classify_text(sentences: SentenceRequest):
    categorized_results = classify_or_create_category(sentences.sentences)
    return {"categorized_sentences": categorized_results, "category_list": list(category_embeddings.keys())}
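Note that app/main.py only registers the skt_classify router, so this route is not reachable in the app as committed. A sketch of exercising it standalone (hypothetical mounting, mirroring the /api prefix used in main.py):

# Sketch: mount and call the /classify route in isolation
from fastapi import FastAPI
from fastapi.testclient import TestClient
from app.routes import classify

demo = FastAPI()
demo.include_router(classify.router, prefix="/api")
client = TestClient(demo)

resp = client.post("/api/classify", json={"sentences": ["도쿄 여행 가고 싶다", "라멘 맛집 추천해줘"]})
print(resp.json())  # {"categorized_sentences": {...}, "category_list": [...]}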

app/routes/skt_classify.py

Lines changed: 13 additions & 0 deletions
from fastapi import APIRouter
from pydantic import BaseModel
from app.services.skt_text_processing import classify_paragraph

router = APIRouter()

class ParagraphRequest(BaseModel):
    paragraph: str  # a single paragraph

@router.post("/classify_paragraph")
async def classify_paragraph_api(request: ParagraphRequest):
    result = classify_paragraph(request.paragraph)
    return result
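This is the route that main.py actually mounts under /api. A minimal request sketch (assumes the model weights are available so the service module imports cleanly):

# Sketch: call the registered /api/classify_paragraph endpoint in-process
from fastapi.testclient import TestClient
from app.main import app

client = TestClient(app)
resp = client.post(
    "/api/classify_paragraph",
    json={"paragraph": "도쿄 여행 일정 공유해요.\n예매는 http://ticket.com 에서 했어요."},
)
print(resp.json())  # {"category": ..., "recommend_category": ..., "sentences": [...]}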

app/server.py

Lines changed: 5 additions & 0 deletions
import uvicorn

if __name__ == "__main__":
    # reload=True enables auto-reload, which helps with debugging once the server is running
    uvicorn.run("app.main:app", host="0.0.0.0", port=8000, reload=True)
app/services/skt_text_processing.py

Lines changed: 84 additions & 0 deletions
import re
from sklearn.metrics.pairwise import cosine_similarity
from typing import List, Dict
import numpy as np
from app.models.skt_kobert import get_sentence_embedding


# Seed examples per category (travel, transit, shopping, food, miscellaneous)
CATEGORY_LABELS = {
    "여행": ["도쿄 여행", "유럽 배낭 여행", "국내 캠핑", "비행기 예약", "숙소 추천"],
    "교통": ["지하철 환승", "고속열차 이용", "비행기 탑승", "대중교통 이용", "렌터카 예약"],
    "쇼핑": ["면세점 할인", "백화점 세일", "전자제품 구매", "패션 브랜드 쇼핑", "기념품 구매"],
    "음식": ["일본 라멘 맛집", "프랑스 빵 추천", "한식당 방문", "카페 탐방", "길거리 음식"],
    "기타": ["문화 체험", "미술관 방문", "박물관 견학", "자연 탐방", "테마파크 방문"]
}

url_pattern = re.compile(r'https?://[a-zA-Z0-9./?=&_%:-]+')

# ✅ Extract the URLs from a sentence and strip them out of the text
def extract_urls_from_sentences(sentence: str):
    urls = url_pattern.findall(sentence)
    text_without_urls = url_pattern.sub(' ', sentence).strip()
    text_without_urls = re.sub(r'\s+', ' ', text_without_urls)
    return text_without_urls, urls

# Category centroids: the mean embedding of each category's seed examples
category_embeddings = {
    category: np.mean([get_sentence_embedding(example) for example in examples], axis=0)
    for category, examples in CATEGORY_LABELS.items()
}

def generate_new_category(text: str):
    # Naive fallback: the first word longer than one character, else "기타" (misc)
    words = text.split()
    for word in words:
        if len(word) > 1:
            return word
    return "기타"

def classify_paragraph(paragraph: str, threshold: float = 0.7):
    global category_embeddings
    sentences = paragraph.split("\n")
    processed_sentences = []
    paragraph_embedding = get_sentence_embedding(paragraph)

    best_category = None
    best_similarity = 0

    for category, category_vector in category_embeddings.items():
        similarity = cosine_similarity([paragraph_embedding], [category_vector])[0][0]
        if similarity > best_similarity:
            best_similarity = similarity
            best_category = category

    # ✅ No sufficiently similar category: return "category": "no" plus a recommendation
    if best_similarity < threshold:
        recommend_category = generate_new_category(paragraph)
        if recommend_category not in category_embeddings:
            category_embeddings[recommend_category] = paragraph_embedding
        return_category = "no"
    else:
        recommend_category = best_category
        return_category = best_category

    for sentence in sentences:
        text_part, urls = extract_urls_from_sentences(sentence)
        sentence_embedding = get_sentence_embedding(text_part) if text_part else None  # note: currently unused

        # Sentences that carried a URL are tagged "관련 링크" (related link)
        if urls:
            sub_category = "관련 링크"
        else:
            sub_category = recommend_category

        processed_sentences.append({
            "text": text_part if text_part else "URL 포함 문장",  # "sentence containing a URL"
            "sub_category": sub_category,
            "urls": urls if urls else None
        })

    return {
        "category": return_category,
        "recommend_category": recommend_category,
        "sentences": processed_sentences
    }
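For orientation, the shape of the returned payload under these rules (a hypothetical illustration; the actual category and similarity values depend on the KoBERT embeddings):

# Illustrative result of classify_paragraph("도쿄 여행 일정 공유해요.\n예매는 http://ticket.com 에서 했어요.")
{
    "category": "여행",             # or "no" when best_similarity < threshold
    "recommend_category": "여행",   # best match, or a word chosen by generate_new_category
    "sentences": [
        {"text": "도쿄 여행 일정 공유해요.", "sub_category": "여행", "urls": None},
        {"text": "예매는 에서 했어요.", "sub_category": "관련 링크", "urls": ["http://ticket.com"]},
    ],
}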

app/services/text_processing.py

Lines changed: 36 additions & 0 deletions
from sklearn.metrics.pairwise import cosine_similarity
from typing import List, Dict
import numpy as np
from app.models.kobert_classifier import get_sentence_embedding

# Store of known categories (initially empty)
category_embeddings: Dict[str, List[np.ndarray]] = {}

# Classify sentences: match an existing category or create a new one
def classify_or_create_category(sentences: List[str], threshold: float = 0.7):
    global category_embeddings
    categorized_sentences = {}

    for sentence in sentences:
        sentence_embedding = get_sentence_embedding(sentence)

        best_category = None
        best_similarity = 0

        # Compare against existing categories by mean cosine similarity
        for category, embeddings in category_embeddings.items():
            similarities = cosine_similarity([sentence_embedding], embeddings).mean()
            if similarities > best_similarity:
                best_similarity = similarities
                best_category = category

        # Append to an existing category if similar enough, otherwise create a new one
        if best_similarity >= threshold and best_category:
            categorized_sentences.setdefault(best_category, []).append(sentence)
            category_embeddings[best_category].append(sentence_embedding)
        else:
            new_category = f"Category_{len(category_embeddings) + 1}"
            category_embeddings[new_category] = [sentence_embedding]
            categorized_sentences.setdefault(new_category, []).append(sentence)

    return categorized_sentences
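A usage sketch (assumes the monologg/kobert model loads; note that category_embeddings is module-level state, so categories accumulate across calls):

# Sketch: categories are created on the fly and reused on later calls
from app.services.text_processing import classify_or_create_category, category_embeddings

first = classify_or_create_category(["도쿄 여행 가고 싶다", "라멘 맛집 추천해줘"])
print(first)                      # e.g. {"Category_1": [...]} or two separate categories
second = classify_or_create_category(["교토 여행 일정 짜는 중"])
print(list(category_embeddings))  # the category store has grown across calls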
