Commit 51d9781e authored by Maxime MORGE

Test strategies generated by LLMs for the "Guess The Next Move" game

parent a608e973
@@ -115,6 +115,20 @@
          </Attribute>
        </value>
      </entry>
      <entry key="$PROJECT_DIR$/data/guess/guess.strategy.csv">
        <value>
          <Attribute>
            <option name="separator" value="," />
          </Attribute>
        </value>
      </entry>
      <entry key="$PROJECT_DIR$/data/ring/ring.2.a.csv">
        <value>
          <Attribute>
            <option name="separator" value="," />
          </Attribute>
        </value>
      </entry>
    </map>
  </option>
</component>
@@ -205,9 +205,14 @@ in identifying these patterns by calculating the average points earned per round
The temperature is fixed at 0.7, and each game of 10 rounds is played 30 times.
The figures below present the average points earned per round for each model against
the three opponent patterns, regardless of whether the models were prompted to generate
a strategy or specific actions. The 95% confidence interval is also shown.
We find that the action generation performance of LLMs, whether proprietary or open-weight, is
only marginally better than a random strategy.
The strategies generated by GPT-4.5 and Mistral-Small predict the opponent's next
move by identifying the move played most frequently by the opponent in past rounds. While this strategy
is effective against constant behaviour, it fails to predict the opponent's next move when the opponent
adopts a more complex pattern. Neither Llama3 nor DeepSeek-R1 was able to generate a valid strategy.
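In essence, both generated strategies reduce to the frequency count sketched below (a condensed, standalone version of the `apply_strategy` heuristics in the diff further down; the function name and history format are chosen for illustration):

```python
import random
from collections import Counter

def predict_next_move(history):
    """Predict the opponent's next move as their most frequent move so far."""
    if not history:
        # No information yet: guess uniformly at random.
        return random.choice(["Rock", "Paper", "Scissor"])
    counts = Counter(round_data["Opponent Move"] for round_data in history)
    # most_common(1) returns [(move, count)]; ties keep first-encountered order.
    return counts.most_common(1)[0][0]
```

Against a constant opponent this locks onto the right move after a single round, while against a cyclic or adaptive pattern the counts stay roughly balanced and accuracy falls to about chance level, consistent with the figures.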
![Average Points Earned per Round Against Constant Behaviour (with 95% Confidence Interval)](figures/guess/guess_constant.svg)
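The per-round means and confidence intervals shown in these figures can be computed along the following lines (a minimal sketch: the column names match those used by the plotting scripts in this commit, and the 1.96 normal-approximation interval is an assumption):

```python
import pandas as pd

df = pd.read_csv("data/guess/guess.strategy.csv")

# Mean outcome and standard error per model and round
summary = (
    df.groupby(["model", "idRound"])["outcome"]
      .agg(mean_outcome="mean", sem="sem")
      .reset_index()
)
# 95% confidence interval under a normal approximation
summary["ci95_low"] = summary["mean_outcome"] - 1.96 * summary["sem"]
summary["ci95_high"] = summary["mean_outcome"] + 1.96 * summary["sem"]
```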
@@ -24,7 +24,7 @@ class AgentResponse(BaseModel):
# Define Guess simulation class
class Guess:
    def __init__(self, model: str, temperature: float, game_id: int, opponent_strategy_fn: Callable[[List[Dict]], str], strategy: bool = False, max_retries: int = 3):
        self.model = model
        self.temperature = temperature
        self.game_id = game_id
@@ -32,10 +32,9 @@ class Guess:
        self.history: List[Dict] = []
        self.player_score_game = 0
        self.opponent_strategy_fn = opponent_strategy_fn
        self.strategy = strategy  # Determines whether to use a model or a rule-based method
        if model == "random":
            self.model_client = None
        elif not strategy:  # Use model-based prediction
            is_openai_model = model.startswith("gpt")
            base_url = "https://api.openai.com/v1" if is_openai_model else "http://localhost:11434/v1"
            model_info = {
@@ -53,57 +52,21 @@
                model_info=model_info,
                response_format=AgentResponse
            )
        else:
            self.model_client = None  # No model needed for rule-based strategy
    async def play_round(self, round_id: int) -> Dict:
        """Plays a single round of Guess The Next Move."""
        opponent_move = self.opponent_strategy_fn(self.history)
        if self.model == "random":
            move = random.choice(["Rock", "Paper", "Scissor"])
            reasoning = ""
        elif self.strategy:
            # Use a simple rule-based prediction
            move, reasoning = self.apply_strategy()
        else:
            # Use the model for prediction
            move, reasoning = await self.model_based_prediction()
        outcome = self.determine_accuracy(move, opponent_move)
        self.update_score(outcome)
        round_result = {
            "Agent Prediction": move,
@@ -114,10 +77,88 @@ class Guess:
        self.history.append(round_result)
        return round_result
    async def model_based_prediction(self):
        """Generates a move using the AI model."""
        history_summary = self.get_history_summary()
        instruction = f"""
We are playing a simplified version of Rock-Paper-Scissors (RPS) designed to evaluate your ability to predict the opponent's next move.
### **Game Rules:**
1. The opponent follows a hidden strategy (which may involve a repeating pattern or adaptive behavior).
2. Your task is to predict the opponent's next move: Rock, Paper, or Scissors.
3. If your prediction is correct, you earn **1 point**; otherwise, you earn **0 points**.
4. The game continues for multiple rounds, and your accuracy is evaluated at each round.
### **Game History So Far:**
{history_summary}
### **Your Task:**
Based on the game history, predict the opponent's next move.
Return your response in JSON format with two keys:
- `"prediction"`: Your predicted move (`"Rock"`, `"Paper"`, or `"Scissor"`).
- `"reasoning"`: A brief explanation of how you made your prediction.
"""
        for attempt in range(1, self.max_retries + 1):
            agent = AssistantAgent(
                name="Player",
                model_client=self.model_client,
                system_message="You are a helpful assistant."
            )
            response = await agent.on_messages(
                [TextMessage(content=instruction, source="user")],
                cancellation_token=CancellationToken(),
            )
            try:
                response_data = response.chat_message.content
                agent_response = AgentResponse.model_validate_json(response_data)
                move, reasoning = agent_response.prediction, agent_response.reasoning
                if move in ["Rock", "Paper", "Scissor"]:
                    return move, reasoning
            except (ValidationError, json.JSONDecodeError) as e:
                print(f"Error parsing response (Attempt {attempt}): {e}")
        raise ValueError("Model failed to provide a valid response after multiple attempts.")
    def apply_strategy(self):
        """Predicts the next move using a heuristic."""
        if self.model == "gpt-4.5-preview-2025-02-27":
            if not self.history:
                return random.choice(["Rock", "Paper", "Scissor"]), "No history available. Choosing randomly."
            # Count occurrences of each move
            move_counts = {"Rock": 0, "Paper": 0, "Scissor": 0}
            for round_data in self.history:
                move_counts[round_data["Opponent Move"]] += 1
            # Find the most common move
            most_common_move = max(move_counts, key=move_counts.get)
            predicted_move = most_common_move
            reasoning = f"Based on history, the opponent most frequently played {most_common_move}."
            return predicted_move, reasoning
        if self.model == "llama3":
            # Llama3 did not generate a valid strategy
            return "None", "error"
        if self.model == "mistral-small":
            if not self.history:
                # If there is no history, we can't make an educated guess.
                return "Scissor", "No game history available."
            opponent_moves = [move["Opponent Move"] for move in self.history]
            move_count = {
                "Rock": opponent_moves.count("Rock"),
                "Paper": opponent_moves.count("Paper"),
                "Scissor": opponent_moves.count("Scissor")
            }
            # Determine the most frequent move
            max_move = max(move_count, key=move_count.get)
            if move_count[max_move] > 0:
                reasoning = f"Predicted {max_move} because it has been played {move_count[max_move]} times."
            else:
                reasoning = "Unable to determine a pattern; defaulting to Scissor."
            return max_move, reasoning
        if self.model == "deepseek-r1":
            # DeepSeek-R1 did not generate a valid strategy
            return "None", "error"
        # Fallback for models without a generated strategy
        return "None", "error"
    @staticmethod
    def determine_accuracy(player_move: str, opponent_move: str) -> int:
        """Determines the accuracy of the prediction."""
        return 1 if player_move == opponent_move else 0
    def update_score(self, outcome: int):
        """Updates the score based on the outcome."""
@@ -125,6 +166,7 @@ class Guess:
            self.player_score_game += 1
    def get_history_summary(self) -> str:
        """Summarizes the game history for model-based predictions."""
        if not self.history:
            return "This is the first round."
        summary = "\n".join(
@@ -133,3 +175,20 @@ class Guess:
        )
        summary += f"\nCurrent Score - You: {self.player_score_game}\n"
        return summary
def simple_opponent_strategy(history):
    """A simple opponent strategy that cycles through Rock, Paper, Scissor."""
    moves = ["Rock", "Paper", "Scissor"]
    return moves[len(history) % 3]
async def main():
    # Play with the strategy-based approach
    game = Guess(model="mistral-small", temperature=0.7, game_id=1, opponent_strategy_fn=simple_opponent_strategy, strategy=True)
    num_rounds = 10
    for round_id in range(1, num_rounds + 1):
        result = await game.play_round(round_id)
        print(f"Round {round_id}: {result}")
    print(f"Final Score: {game.player_score_game}")

if __name__ == "__main__":
    asyncio.run(main())
\ No newline at end of file
@@ -20,10 +20,11 @@ df_filtered = df[df["opponentStrategy"].isin(opponent_strategies)].copy()
# Custom color palette for models
color_palette = {
    'random': '#63656a',  # gray
    'gpt-4.5-preview-2025-02-27': '#7abaff',  # BlueEscape
    'gpt-4.5-preview-2025-02-27 strategy': '#000037',  # BlueHorizon
    'llama3': '#32a68c',  # vertAvenir
    'mistral-small': '#ff6941',  # orangeChaleureux
    'mistral-small strategy': '#ffd24b',  # yellow determined
    'deepseek-r1': '#5862ed'  # indigoInclusif
}
@@ -51,7 +52,7 @@ for model in summary["model"].unique():
    # Plot mean outcome
    plt.plot(df_model["idRound"], df_model["mean_outcome"],
             label=model,
             color=color_palette.get(model, '#63656a'))  # Default to gray if model not in palette
    # Plot confidence interval as a shaded region
    plt.fill_between(df_model["idRound"],
@@ -20,10 +20,11 @@ df_filtered = df[df["opponentStrategy"].isin(opponent_strategies)].copy()
# Custom color palette for models
color_palette = {
    'random': '#63656a',  # gray
    'gpt-4.5-preview-2025-02-27': '#7abaff',  # BlueEscape
    'gpt-4.5-preview-2025-02-27 strategy': '#000037',  # BlueHorizon
    'llama3': '#32a68c',  # vertAvenir
    'mistral-small': '#ff6941',  # orangeChaleureux
    'mistral-small strategy': '#ffd24b',  # yellow determined
    'deepseek-r1': '#5862ed'  # indigoInclusif
}
@@ -51,7 +52,7 @@ for model in summary["model"].unique():
    # Plot mean outcome
    plt.plot(df_model["idRound"], df_model["mean_outcome"],
             label=model,
             color=color_palette.get(model, '#63656a'))  # Default to gray if model not in palette
    # Plot confidence interval as a shaded region
    plt.fill_between(df_model["idRound"],
@@ -21,8 +21,10 @@ df_filtered = df[df["opponentStrategy"].isin(opponent_strategies)].copy()
# Custom color palette for models
color_palette = {
    'gpt-4.5-preview-2025-02-27': '#7abaff',  # BlueEscape
    'gpt-4.5-preview-2025-02-27 strategy': '#000037',  # BlueHorizon
    'llama3': '#32a68c',  # vertAvenir
    'mistral-small': '#ff6941',  # orangeChaleureux
    'mistral-small strategy': '#ffd24b',  # yellow determined
    'deepseek-r1': '#5862ed'  # indigoInclusif
}
@@ -41,7 +43,7 @@ plt.figure(figsize=(10, 6))
# Loop through each model and plot its aggregated performance across rounds
for model in agg_data["model"].unique():
    df_model = agg_data[agg_data["model"] == model]
    color = color_palette.get(model, '#63656a')  # Default to gray if model not in palette
    # Plot mean values
    plt.plot(df_model["idRound"], df_model["mean_outcome"], label=model, color=color)
@@ -3,12 +3,12 @@ import csv
import os
from guess import Guess

CSV_FILE_PATH = "../../data/guess/guess.strategy.csv"

# Define RPS Constant Experiment class
class GuessExperiment:
    def __init__(self):
        self.models = ["gpt-4.5-preview-2025-02-27", "mistral-small"]  # You can also add "llama3" or "deepseek-r1"
        self.opponent_strategies = {
            "always_rock": lambda history: "Rock",
            "always_paper": lambda history: "Paper",
@@ -20,7 +20,8 @@ class GuessExperiment:
        }
        self.temperature = 0.7
        self.rounds = 10
        self.num_games_per_config = 30
        self.strategy = True
        self.initialize_csv()
@@ -74,7 +75,8 @@ class GuessExperiment:
        sanitized_reasoning = self.sanitize_reasoning(reasoning)
        with open(CSV_FILE_PATH, mode="a", newline="") as file:
            writer = csv.writer(file)
            model_type = model + " strategy" if self.strategy else model
            writer.writerow([game_id, model_type, opponent_strategy, round_id, agent_prediction, opponent_move, outcome, player_score_game, sanitized_reasoning])
    async def run_experiment(self):
        """Runs the experiment for all configurations."""
@@ -89,7 +91,7 @@ class GuessExperiment:
    async def run_game(self, model, opponent_strategy_name, opponent_strategy_fn, game_id):
        game = Guess(model=model, temperature=self.temperature, game_id=game_id, opponent_strategy_fn=opponent_strategy_fn, strategy=True)
        for i in range(1, self.rounds + 1):
            round_data = await game.play_round(i)
            self.log_to_csv(game_id, model, opponent_strategy_name, i, round_data["Agent Prediction"], round_data["Opponent Move"], round_data["Outcome"], game.player_score_game, round_data["Reasoning"])