
Commit af3a3b3

Refactor: use batch generation in code quality evaluator (#440)
1 parent 6bb7c57 commit af3a3b3

1 file changed

dataflow/operators/code/eval/code_quality_sample_evaluator.py

Lines changed: 34 additions & 23 deletions
@@ -83,25 +83,25 @@ def _validate_dataframe(self, dataframe: pd.DataFrame):
         if conflict:
             raise ValueError(f"The following column(s) already exist and would be overwritten by CodeQualitySampleEvaluator: {conflict}")
 
-    def _score_func(self, instruction: str, code: str) -> Tuple[int, str]:
+    def _build_prompts(self, dataframe: pd.DataFrame, input_instruction_key: str, input_code_key: str) -> List[str]:
         """
-        Evaluate a single instruction-code pair and return score and feedback.
+        Build prompts for all instruction-code pairs in the dataframe.
 
         Args:
-            instruction: The instruction text
-            code: The generated code text
+            dataframe: Input DataFrame
+            input_instruction_key: Field name containing instructions
+            input_code_key: Field name containing code
 
         Returns:
-            Tuple of (score, feedback) where score is an integer and feedback is a string
+            List of prompt strings
         """
-        prompt = self.prompt_template.build_prompt(instruction=instruction, code=code)
-        response = self.llm_serving.generate_from_input(user_inputs=[prompt], system_prompt="")
-
-        if not response or len(response) == 0:
-            self.logger.warning("Empty response from LLM")
-            return 0, "No response from LLM"
-
-        return self._parse_score_and_feedback(response[0])
+        prompts = []
+        for _, row in dataframe.iterrows():
+            instruction = row[input_instruction_key]
+            code = row[input_code_key]
+            prompt = self.prompt_template.build_prompt(instruction=instruction, code=code)
+            prompts.append(prompt)
+        return prompts
 
     def _parse_score_and_feedback(self, response: str) -> Tuple[int, str]:
         """
@@ -133,23 +133,34 @@ def eval(self, dataframe: pd.DataFrame, input_instruction_key: str, input_code_k
 
         Args:
             dataframe: Input DataFrame
-            input_key: Field name containing instruction-code pairs (as dict with 'instruction' and 'code' keys)
+            input_instruction_key: Field name containing instructions
+            input_code_key: Field name containing code
 
         Returns:
             Tuple of (scores, feedbacks) lists
         """
         self.logger.info(f"Evaluating {self.score_name}...")
 
-        scores = []
-        feedbacks = []
+        # Build all prompts at once
+        prompts = self._build_prompts(dataframe, input_instruction_key, input_code_key)
 
-        for _, row in dataframe.iterrows():
-            instruction = row[input_instruction_key]
-            code = row[input_code_key]
-
-            score, feedback = self._score_func(instruction, code)
-            scores.append(score)
-            feedbacks.append(feedback)
+        # Generate responses in batch
+        responses = self.llm_serving.generate_from_input(user_inputs=prompts, system_prompt="")
+
+        if not responses or len(responses) == 0:
+            self.logger.warning("Empty response from LLM")
+            scores = [0] * len(dataframe)
+            feedbacks = ["No response from LLM"] * len(dataframe)
+        elif len(responses) != len(dataframe):
+            self.logger.warning(f"Response count ({len(responses)}) doesn't match dataframe length ({len(dataframe)})")
+            scores = [0] * len(dataframe)
+            feedbacks = ["Response count mismatch"] * len(dataframe)
+        else:
+            # Parse all responses
+            results = [self._parse_score_and_feedback(response) for response in responses]
+            scores, feedbacks = zip(*results)
+            scores = list(scores)
+            feedbacks = list(feedbacks)
 
         self.logger.info("Evaluation complete!")
         return scores, feedbacks
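
The practical effect of the refactor: eval() now makes a single generate_from_input call over all prompts instead of one call per row, so the serving layer can batch or parallelize requests, and the new mismatch branch guards against a backend returning a different number of responses than prompts. A self-contained sketch of the same pattern, where EchoServing is a hypothetical stand-in for the project's LLM serving backend and the prompt wording is illustrative (the real operator builds prompts through self.prompt_template):

from typing import List, Tuple
import pandas as pd

class EchoServing:
    """Hypothetical stand-in for an LLM serving backend; returns canned 'Score:' replies."""
    def generate_from_input(self, user_inputs: List[str], system_prompt: str = "") -> List[str]:
        # One response per prompt, in the same order: the contract the refactored eval() relies on.
        return [f"Score: 4\nPrompt {i}: code looks reasonable." for i, _ in enumerate(user_inputs)]

def evaluate_batch(df: pd.DataFrame, serving: EchoServing) -> Tuple[List[int], List[str]]:
    # Build every prompt up front (mirrors _build_prompts).
    prompts = [
        f"Instruction:\n{row['instruction']}\n\nCode:\n{row['code']}\n\nRate the code from 1 to 5."
        for _, row in df.iterrows()
    ]
    # One batched call instead of a call per row (mirrors the new eval()).
    responses = serving.generate_from_input(user_inputs=prompts, system_prompt="")
    if not responses or len(responses) != len(df):
        return [0] * len(df), ["No usable response"] * len(df)
    scores, feedbacks = [], []
    for response in responses:
        score_line, _, feedback = response.partition("\n")
        scores.append(int(score_line.replace("Score:", "").strip() or 0))
        feedbacks.append(feedback.strip())
    return scores, feedbacks

if __name__ == "__main__":
    df = pd.DataFrame({
        "instruction": ["Reverse a string", "Sum a list of numbers"],
        "code": ["def rev(s): return s[::-1]", "def total(xs): return sum(xs)"],
    })
    print(evaluate_batch(df, EchoServing()))

Because the responses come back as one ordered list, the zip(*results) in the diff can pair scores and feedbacks with the original DataFrame rows without any extra bookkeeping.
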

0 commit comments
