Commit 51d9781e authored by Maxime MORGE

Test strategies generated by LLMs for the "Guess The Next Move" game

parent a608e973
@@ -115,6 +115,20 @@
          </Attribute>
        </value>
      </entry>
      <entry key="$PROJECT_DIR$/data/guess/guess.strategy.csv">
        <value>
          <Attribute>
            <option name="separator" value="," />
          </Attribute>
        </value>
      </entry>
      <entry key="$PROJECT_DIR$/data/ring/ring.2.a.csv">
        <value>
          <Attribute>
            <option name="separator" value="," />
          </Attribute>
        </value>
      </entry>
    </map>
  </option>
</component>
@@ -205,9 +205,14 @@ in identifying these patterns by calculating the average points earned per round
The temperature is fixed at 0.7, and each game of 10 rounds is played 30 times.
The figures below present the average points earned per round for each model against
the three opponent patterns, regardless of whether the models were prompted to generate
a strategy or specific actions. The 95% confidence interval is also shown.
We find that the action generation performance of LLMs, whether proprietary or open-weight, is
only marginally better than a random strategy.
The strategies generated by GPT-4.5 and Mistral-Small predict the opponent's next
move by identifying the move played most frequently by the opponent in past rounds. While this strategy
is effective against constant behaviour, it fails to predict the opponent's next move when the opponent
adopts a more complex pattern. Neither Llama3 nor DeepSeek-R1 was able to generate a valid strategy.
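In essence, both generated strategies reduce to the frequency count sketched below (a condensed, standalone version of the `apply_strategy` heuristics in the diff further down; the function name and history format are chosen for illustration):

```python
import random
from collections import Counter

def predict_next_move(history):
    """Predict the opponent's next move as their most frequent move so far."""
    if not history:
        # No information yet: guess uniformly at random.
        return random.choice(["Rock", "Paper", "Scissor"])
    counts = Counter(round_data["Opponent Move"] for round_data in history)
    # most_common(1) returns [(move, count)]; ties keep first-encountered order.
    return counts.most_common(1)[0][0]
```

Against a constant opponent this locks onto the right move after a single round, while against a cyclic or adaptive pattern the counts stay roughly balanced and accuracy falls to about chance level, consistent with the figures.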
![Average Points Earned per Round Against Constant Behaviour (with 95% Confidence Interval)](figures/guess/guess_constant.svg)
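The per-round means and confidence intervals shown in these figures can be computed along the following lines (a minimal sketch: the column names match those used by the plotting scripts in this commit, and the 1.96 normal-approximation interval is an assumption):

```python
import pandas as pd

df = pd.read_csv("data/guess/guess.strategy.csv")

# Mean outcome and standard error per model and round
summary = (
    df.groupby(["model", "idRound"])["outcome"]
      .agg(mean_outcome="mean", sem="sem")
      .reset_index()
)
# 95% confidence interval under a normal approximation
summary["ci95_low"] = summary["mean_outcome"] - 1.96 * summary["sem"]
summary["ci95_high"] = summary["mean_outcome"] + 1.96 * summary["sem"]
```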
@@ -24,7 +24,7 @@ class AgentResponse(BaseModel):
# Define Guess simulation class
class Guess:
    def __init__(self, model: str, temperature: float, game_id: int, opponent_strategy_fn: Callable[[List[Dict]], str], strategy: bool = False, max_retries: int = 3):
        self.model = model
        self.temperature = temperature
        self.game_id = game_id
@@ -32,10 +32,9 @@ class Guess:
        self.history: List[Dict] = []
        self.player_score_game = 0
        self.opponent_strategy_fn = opponent_strategy_fn
        self.strategy = strategy  # Determines whether to use a model or a rule-based method
        if model == "random":
            self.model_client = None
        elif not strategy:  # Use model-based prediction
            is_openai_model = model.startswith("gpt")
            base_url = "https://api.openai.com/v1" if is_openai_model else "http://localhost:11434/v1"
            model_info = {
@@ -53,57 +52,21 @@
                model_info=model_info,
                response_format=AgentResponse
            )
        else:
            self.model_client = None  # No model needed for rule-based strategy
    async def play_round(self, round_id: int) -> Dict:
        """Plays a single round of Guess The Next Move."""
        opponent_move = self.opponent_strategy_fn(self.history)
        if self.model == "random":
            move = random.choice(["Rock", "Paper", "Scissor"])
            reasoning = ""
        elif self.strategy:
            # Use a simple rule-based prediction
            move, reasoning = self.apply_strategy()
        else:
            # Use the model for prediction
            move, reasoning = await self.model_based_prediction()
        outcome = self.determine_accuracy(move, opponent_move)
        self.update_score(outcome)
        round_result = {
            "Agent Prediction": move,
@@ -114,10 +77,88 @@ class Guess:
        self.history.append(round_result)
        return round_result
    async def model_based_prediction(self):
        """Generates a move using the AI model."""
        history_summary = self.get_history_summary()
        instruction = f"""
We are playing a simplified version of Rock-Paper-Scissors (RPS) designed to evaluate your ability to predict the opponent's next move.
### **Game Rules:**
1. The opponent follows a hidden strategy (which may involve a repeating pattern or adaptive behavior).
2. Your task is to predict the opponent's next move: Rock, Paper, or Scissors.
3. If your prediction is correct, you earn **1 point**; otherwise, you earn **0 points**.
4. The game continues for multiple rounds, and your accuracy is evaluated at each round.
### **Game History So Far:**
{history_summary}
### **Your Task:**
Based on the game history, predict the opponent's next move.
Return your response in JSON format with two keys:
- `"prediction"`: Your predicted move (`"Rock"`, `"Paper"`, or `"Scissor"`).
- `"reasoning"`: A brief explanation of how you made your prediction.
"""
        for attempt in range(1, self.max_retries + 1):
            agent = AssistantAgent(
                name="Player",
                model_client=self.model_client,
                system_message="You are a helpful assistant."
            )
            response = await agent.on_messages(
                [TextMessage(content=instruction, source="user")],
                cancellation_token=CancellationToken(),
            )
            try:
                response_data = response.chat_message.content
                agent_response = AgentResponse.model_validate_json(response_data)
                move, reasoning = agent_response.prediction, agent_response.reasoning
                if move in ["Rock", "Paper", "Scissor"]:
                    return move, reasoning
            except (ValidationError, json.JSONDecodeError) as e:
                print(f"Error parsing response (Attempt {attempt}): {e}")
        raise ValueError("Model failed to provide a valid response after multiple attempts.")
    def apply_strategy(self):
        """Predicts the next move using a heuristic."""
        if self.model == "gpt-4.5-preview-2025-02-27":
            if not self.history:
                return random.choice(["Rock", "Paper", "Scissor"]), "No history available. Choosing randomly."
            # Count occurrences of each move
            move_counts = {"Rock": 0, "Paper": 0, "Scissor": 0}
            for round_data in self.history:
                move_counts[round_data["Opponent Move"]] += 1
            # Find the most common move
            most_common_move = max(move_counts, key=move_counts.get)
            predicted_move = most_common_move
            reasoning = f"Based on history, the opponent most frequently played {most_common_move}."
            return predicted_move, reasoning
        if self.model == "llama3":
            # Llama3 did not generate a valid strategy
            return "None", "error"
        if self.model == "mistral-small":
            if not self.history:
                # If there is no history, we can't make an educated guess.
                return "Scissor", "No game history available."
            opponent_moves = [move["Opponent Move"] for move in self.history]
            move_count = {
                "Rock": opponent_moves.count("Rock"),
                "Paper": opponent_moves.count("Paper"),
                "Scissor": opponent_moves.count("Scissor")
            }
            # Determine the most frequent move
            max_move = max(move_count, key=move_count.get)
            if move_count[max_move] > 0:
                reasoning = f"Predicted {max_move} because it has been played {move_count[max_move]} times."
            else:
                reasoning = "Unable to determine a pattern; defaulting to Scissor."
            return max_move, reasoning
        if self.model == "deepseek-r1":
            # DeepSeek-R1 did not generate a valid strategy
            return "None", "error"
        # Fallback for models without a generated strategy
        return "None", "error"
    @staticmethod
    def determine_accuracy(player_move: str, opponent_move: str) -> int:
        """Determines the accuracy of the prediction."""
        return 1 if player_move == opponent_move else 0
    def update_score(self, outcome: int):
        """Updates the score based on the outcome."""
@@ -125,6 +166,7 @@ class Guess:
            self.player_score_game += 1
    def get_history_summary(self) -> str:
        """Summarizes the game history for model-based predictions."""
        if not self.history:
            return "This is the first round."
        summary = "\n".join(
@@ -133,3 +175,20 @@ class Guess:
        )
        summary += f"\nCurrent Score - You: {self.player_score_game}\n"
        return summary
def simple_opponent_strategy(history):
    """A simple opponent strategy that cycles through Rock, Paper, Scissor."""
    moves = ["Rock", "Paper", "Scissor"]
    return moves[len(history) % 3]
async def main():
    # Play with the strategy-based approach
    game = Guess(model="mistral-small", temperature=0.7, game_id=1, opponent_strategy_fn=simple_opponent_strategy, strategy=True)
    num_rounds = 10
    for round_id in range(1, num_rounds + 1):
        result = await game.play_round(round_id)
        print(f"Round {round_id}: {result}")
    print(f"Final Score: {game.player_score_game}")

if __name__ == "__main__":
    asyncio.run(main())
\ No newline at end of file
@@ -20,10 +20,11 @@ df_filtered = df[df["opponentStrategy"].isin(opponent_strategies)].copy()
# Custom color palette for models
color_palette = {
    'random': '#63656a',  # gray
    'gpt-4.5-preview-2025-02-27': '#7abaff',  # BlueEscape
    'gpt-4.5-preview-2025-02-27 strategy': '#000037',  # BlueHorizon
    'llama3': '#32a68c',  # vertAvenir
    'mistral-small': '#ff6941',  # orangeChaleureux
    'mistral-small strategy': '#ffd24b',  # yellow determined
    'deepseek-r1': '#5862ed'  # indigoInclusif
}
@@ -51,7 +52,7 @@ for model in summary["model"].unique():
    # Plot mean outcome
    plt.plot(df_model["idRound"], df_model["mean_outcome"],
             label=model,
             color=color_palette.get(model, '#63656a'))  # Default to gray if model not in palette
    # Plot confidence interval as a shaded region
    plt.fill_between(df_model["idRound"],
@@ -20,10 +20,11 @@ df_filtered = df[df["opponentStrategy"].isin(opponent_strategies)].copy()
# Custom color palette for models
color_palette = {
    'random': '#63656a',  # gray
    'gpt-4.5-preview-2025-02-27': '#7abaff',  # BlueEscape
    'gpt-4.5-preview-2025-02-27 strategy': '#000037',  # BlueHorizon
    'llama3': '#32a68c',  # vertAvenir
    'mistral-small': '#ff6941',  # orangeChaleureux
    'mistral-small strategy': '#ffd24b',  # yellow determined
    'deepseek-r1': '#5862ed'  # indigoInclusif
}
@@ -51,7 +52,7 @@ for model in summary["model"].unique():
    # Plot mean outcome
    plt.plot(df_model["idRound"], df_model["mean_outcome"],
             label=model,
             color=color_palette.get(model, '#63656a'))  # Default to gray if model not in palette
    # Plot confidence interval as a shaded region
    plt.fill_between(df_model["idRound"],
@@ -21,8 +21,10 @@ df_filtered = df[df["opponentStrategy"].isin(opponent_strategies)].copy()
# Custom color palette for models
color_palette = {
    'gpt-4.5-preview-2025-02-27': '#7abaff',  # BlueEscape
    'gpt-4.5-preview-2025-02-27 strategy': '#000037',  # BlueHorizon
    'llama3': '#32a68c',  # vertAvenir
    'mistral-small': '#ff6941',  # orangeChaleureux
    'mistral-small strategy': '#ffd24b',  # yellow determined
    'deepseek-r1': '#5862ed'  # indigoInclusif
}
@@ -41,7 +43,7 @@ plt.figure(figsize=(10, 6))
# Loop through each model and plot its aggregated performance across rounds
for model in agg_data["model"].unique():
    df_model = agg_data[agg_data["model"] == model]
    color = color_palette.get(model, '#63656a')  # Default to gray if model not in palette
    # Plot mean values
    plt.plot(df_model["idRound"], df_model["mean_outcome"], label=model, color=color)
@@ -3,12 +3,12 @@ import csv
import os
from guess import Guess

CSV_FILE_PATH = "../../data/guess/guess.strategy.csv"

# Define RPS Constant Experiment class
class GuessExperiment:
    def __init__(self):
        self.models = ["gpt-4.5-preview-2025-02-27", "mistral-small"]  # You can also add "llama3" or "deepseek-r1"
        self.opponent_strategies = {
            "always_rock": lambda history: "Rock",
            "always_paper": lambda history: "Paper",
@@ -20,7 +20,8 @@ class GuessExperiment:
        }
        self.temperature = 0.7
        self.rounds = 10
        self.num_games_per_config = 30
        self.strategy = True
        self.initialize_csv()
@@ -74,7 +75,8 @@ class GuessExperiment:
        sanitized_reasoning = self.sanitize_reasoning(reasoning)
        with open(CSV_FILE_PATH, mode="a", newline="") as file:
            writer = csv.writer(file)
            model_type = model + " strategy" if self.strategy else model
            writer.writerow([game_id, model_type, opponent_strategy, round_id, agent_prediction, opponent_move, outcome, player_score_game, sanitized_reasoning])
    async def run_experiment(self):
        """Runs the experiment for all configurations."""
@@ -89,7 +91,7 @@ class GuessExperiment:
    async def run_game(self, model, opponent_strategy_name, opponent_strategy_fn, game_id):
        game = Guess(model=model, temperature=self.temperature, game_id=game_id, opponent_strategy_fn=opponent_strategy_fn, strategy=True)
        for i in range(1, self.rounds + 1):
            round_data = await game.play_round(i)
            self.log_to_csv(game_id, model, opponent_strategy_name, i, round_data["Agent Prediction"], round_data["Opponent Move"], round_data["Outcome"], game.player_score_game, round_data["Reasoning"])