Data processor

Data processing module for UFC fight data.

Provides classes to prepare UFC fight data for model training and evaluation, handling data transformation, normalization, and feature engineering.

DataProcessor

A data processor class designed to prepare and normalize UFC fight data for training and testing neural network models.

This class provides a way to handle and transform raw data into a format suitable for model training and evaluation.

The DataProcessor is designed to work with the dataset classes in ufcpredictor.datasets to provide a seamless data preparation workflow.
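
A minimal usage sketch of the full workflow, assuming the data folder already contains scraped UFC and BestFightOdds data in the layout expected by the scrapers (the path and import below follow the source location shown on this page and are illustrative):

from ufcpredictor.data_processor import DataProcessor

processor = DataProcessor(data_folder="data/")   # illustrative path
processor.load_data()                            # join, clean and filter the raw tables
processor.aggregate_data()                       # combine round stats over each fighter's history
processor.add_per_minute_and_fight_stats()       # derive *_per_minute and *_per_fight columns
processor.normalize_data()                       # divide normalized_fields by their means

normalized = processor.data_normalized           # input for the dataset classes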

Source code in ufcpredictor/data_processor.py
class DataProcessor:
    """
    A data processor class designed to prepare and normalize UFC fight data for
    training and testing neural network models.

    This class provides a way to handle and transform raw data into a format suitable
    for model training and evaluation.

    The DataProcessor is designed to work with the dataset classes in
    ufcpredictor.datasets to provide a seamless data preparation workflow.
    """

    mlflow_params: List[str] = []
    normalization_factors: Dict[str, float] = {}

    def __init__(
        self,
        data_folder: Optional[Path | str] = None,
        ufc_scraper: Optional[UFCScraper] = None,
        bfo_scraper: Optional[BestFightOddsScraper] = None,
        data_aggregator: Optional[DataAggregator] = None,
        data_enhancers: List[DataEnhancer] = [],
    ) -> None:
        """
        Constructor for DataProcessor.

        Args:
            data_folder: The folder containing the data.
            ufc_scraper: The scraper to use for ufc data.
            bfo_scraper: The scraper to use for best fight odds data.
            data_aggregator: The data aggregator to use for aggregating data.
            data_enhancers: The list of data enhancers to apply to the data.

        Raises:
            ValueError: If data_folder is None and both ufc_scraper and
                bfo_scraper are None.
        """
        if data_folder is None and (ufc_scraper is None or bfo_scraper is None):
            raise ValueError(
                "If data_folder is None, both ufc_scraper and bfo_scraper "
                "should be provided"
            )

        self.scraper = ufc_scraper or UFCScraper(data_folder=data_folder)
        self.bfo_scraper = bfo_scraper or BestFightOddsScraper(
            data_folder=data_folder, n_sessions=-1
        )

        self.data_aggregator = data_aggregator or DefaultDataAggregator()
        self.data_enhancers = data_enhancers

    def load_data(self) -> None:
        """
        Loads and processes all the data.

        First, it joins all the relevant dataframes (fight, fighter, event, and odds).
        Then, it fixes the date and time fields, converts the odds to decimal format,
        fills the weight for each fighter (if not available), adds key statistics
        (KO, Submission, and Win), and applies filters to the data.
        Finally, it groups the round data by fighter and fight, and assigns the result
        to the data attribute.

        This method should be called before any other method.
        """
        data = self.join_dataframes()
        data = self.fix_date_and_time_fields(data)
        data = self.convert_odds_to_decimal(data)
        data = self.fill_weight(data)
        data = self.add_key_stats(data)
        data = self.apply_filters(data)
        self.data = self.group_round_data(data)
        self.data["num_fight"] = self.data.groupby("fighter_id").cumcount() + 1

        for data_enhancer in self.data_enhancers:
            self.data = data_enhancer.add_data_fields(self.data)

        names = self.data["fighter_name"].values
        ids = self.data["fighter_id"].values

        self.fighter_names = {id_: name_ for id_, name_ in zip(ids, names)}
        self.fighter_ids = {name_: id_ for id_, name_ in zip(ids, names)}

    def get_fighter_name(self, id_: str) -> str:
        """
        Returns the name of the fighter with the given id.

        Args:
            id_: The id of the fighter.

        Returns:
            The name of the fighter.
        """
        return self.fighter_names[id_]

    def get_fighter_id(self, name: str) -> str:
        """
        Returns the id of the fighter with the given name.
        Search is performed using fuzzywuzzy.
        If multiple matches are found, the first one is returned.

        Args:
            name: The name of the fighter.

        Returns:
            The id of the fighter.
        """
        best_name, score = extractOne(name, self.fighter_ids.keys())

        if score < 100:
            logger.warning(
                f"Fighter found for {name} with {score}% accuracy: {best_name}"
            )
        return self.fighter_ids[best_name]

    def join_dataframes(self) -> pd.DataFrame:
        """
        Joins all the relevant dataframes (fight, fighter, event, and odds).

        It duplicates the current fight data to create two rows per match,
        one row for each fighter, and assigns fighter and opponent to each other.
        Then, it merges the fighter data, round data, and odds data to the
        previous table. Finally, it adds the date of the event to the dataframe.

        Returns:
            The joined dataframe.
        """
        fight_data = self.scraper.fight_scraper.data
        round_data = self.scraper.fight_scraper.rounds_handler.data
        fighter_data = self.scraper.fighter_scraper.data
        event_data = self.scraper.event_scraper.data

        odds_data = self.bfo_scraper.data

        ###########################################################
        # I want to create two rows per match, one row for each fighter
        ###########################################################
        # Hence I need to duplicate the current fight data
        # Assigning fighter and opponent to each other
        data = pd.concat(
            [
                fight_data.rename(
                    columns={
                        "fighter_1": "opponent_id",
                        "fighter_2": "fighter_id",
                        "scores_1": "opponent_score",
                        "scores_2": "fighter_score",
                    }
                ),
                fight_data.rename(
                    columns={
                        "fighter_2": "opponent_id",
                        "fighter_1": "fighter_id",
                        "scores_2": "opponent_score",
                        "scores_1": "fighter_score",
                    }
                ),
            ]
        )

        # I am merging the fighter data to the previous table
        # This includes height, reach etc...
        fighter_data["fighter_name"] = (
            fighter_data["fighter_f_name"]
            + " "
            + fighter_data["fighter_l_name"].fillna("")
        )
        data = data.merge(
            fighter_data,  # [fighter_fields],
            on="fighter_id",
            how="left",
        )

        data = data.merge(
            fighter_data[["fighter_id", "fighter_name", "fighter_nickname"]],
            left_on="opponent_id",
            right_on="fighter_id",
            how="left",
            suffixes=("", "_opponent"),
        )

        #############################################################
        # Add round data.
        #############################################################

        # Merging columns
        round_data = pd.merge(
            round_data,
            round_data,
            on=["fight_id", "round"],
            suffixes=("", "_opponent"),
        )

        # And then remove the match of the fighter with itself
        round_data = round_data[
            round_data["fighter_id"] != round_data["fighter_id_opponent"]
        ]

        data = data.merge(
            round_data,
            on=[
                "fight_id",
                "fighter_id",
                "fighter_id_opponent",
            ],
        )

        ##############################################################
        # Add odds data
        ###############################################################
        data = data.merge(
            odds_data,
            on=["fight_id", "fighter_id"],
        )

        # Add the date of the event to the dataframe
        data = data.merge(
            event_data[["event_id", "event_date"]],  # I only need the date for now,
            on="event_id",
        )

        return data

    @staticmethod
    def fix_date_and_time_fields(data: pd.DataFrame) -> pd.DataFrame:
        """
        Fix date and time fields in the dataframe.

        This function takes care of converting control time, finish time
        and total time from minutes to seconds. It also converts the
        event date and fighter date of birth to datetime objects.

        The dataframe is then sorted by fighter id and event date.

        Args:
            data: The dataframe to be processed.

        Returns:
            The dataframe with the fields fixed.
        """
        data["ctrl_time"] = data["ctrl_time"].apply(convert_minutes_to_seconds)
        data["ctrl_time_opponent"] = data["ctrl_time_opponent"].apply(
            convert_minutes_to_seconds
        )
        data["finish_time"] = data["finish_time"].apply(convert_minutes_to_seconds)
        data["total_time"] = (data["finish_round"] - 1) * 5 * 60 + data["finish_time"]
        data["event_date"] = pd.to_datetime(data["event_date"])
        data["fighter_dob"] = pd.to_datetime(data["fighter_dob"])

        data = data.sort_values(by=["fighter_id", "event_date"])

        return data

    @staticmethod
    def convert_odds_to_decimal(data: pd.DataFrame) -> pd.DataFrame:
        """
        Convert odds from American format to decimal format.

        Args:
            data: The dataframe with the odds in American format.

        Returns:
            The dataframe with the odds in decimal format.
        """
        for field in "opening", "closing_range_min", "closing_range_max":
            data[field] = data[field].astype(float)
            msk = data[field] > 0

            data.loc[msk, field] = data.loc[msk, field] / 100 + 1
            data.loc[~msk, field] = 100 / -data.loc[~msk, field] + 1

        return data

    @staticmethod
    def fill_weight(data: pd.DataFrame) -> pd.DataFrame:
        """
        Fill the weight column using the weight_class column and the weight_dict.

        The weight_dict is a dictionary mapping the weight classes to their
        corresponding weights in lbs. The weights are then filled in the weight
        column according to the weight classes in the weight_class column.

        This function also removes rows with null weight classes, or open weight
        or catch weight (agreed weight outside a weight class).

        Args:
            data: The dataframe to be processed.

        Returns:
            The dataframe with the weight column filled.
        """
        data.loc[:, "weight"] = data["weight_class"].map(weight_dict)

        ##################################################################################
        # Remove null weight classes, or open weight or catch weight (agreed weight outside a weight class)
        ##################################################################################
        data = data[
            (data["weight_class"] != "NULL")
            & (data["weight_class"] != "Catch Weight")
            & (data["weight_class"] != "Open Weight")
        ]

        return data

    @staticmethod
    def add_key_stats(data: pd.DataFrame) -> pd.DataFrame:
        """
        Add key statistics to the dataframe.

        This function adds columns to the dataframe indicating whether a fighter
        has won a fight via KO, submission or decision, and whether the opponent
        has won a fight via KO, submission or decision. It also adds a column
        indicating the age of the fighter at the time of the fight.

        Args:
            data: The dataframe to be processed.

        Returns:
            The dataframe with the added columns.
        """
        #############################################
        # Add some missing stats
        # KO, Submission and Win
        #############################################
        # Whether fighter has KOd his opponent
        data["KO"] = np.where(
            (data["result"].str.contains("KO"))
            & (data["winner"] == data["fighter_id"]),
            1,
            0,
        )

        # Whether the fighter has been KOd by his opponent
        data["KO_opponent"] = np.where(
            (data["result"].str.contains("KO"))
            & (data["winner"] != data["fighter_id"]),
            1,
            0,
        )

        # Same for submission
        data["Sub"] = np.where(
            (data["result"].str.contains("Submission"))
            & (data["winner"] == data["fighter_id"]),
            1,
            0,
        )

        data["Sub_opponent"] = np.where(
            (data["result"].str.contains("Submission"))
            & (data["winner"] != data["fighter_id"]),
            1,
            0,
        )

        data["win"] = np.where(data["winner"] == data["fighter_id"], 1, 0)
        data["win_opponent"] = np.where(data["winner"] != data["fighter_id"], 1, 0)
        data["age"] = (data["event_date"] - data["fighter_dob"]).dt.days / 365

        return data

    @staticmethod
    def apply_filters(data: pd.DataFrame) -> pd.DataFrame:
        """
        Apply filters to the dataframe.

        This function applies filters to the dataframe to remove fights:
        - Before August 1, 2008, since odds are not available for these
        - With non-standard fight formats (time_format not in ["3 Rnd (5-5-5)", "5 Rnd (5-5-5-5-5)"])
        - With female fighters (gender not in ["M"])
        - With disqualified or doctor's stoppage results (result not in ["Decision", "KO/TKO", "Submission"])
        - With draws, no contests, or missing winners (winner in ("Draw", "NC") or winner.isna())

        Args:
            data: The dataframe to be processed.

        Returns:
            The dataframe with the applied filters.
        """
        # Remove old fights since I don't have odds for these
        data = data[data["event_date"].dt.date >= datetime.date(2008, 8, 1)]

        # Remove non-standard fight format
        data = data[data["time_format"].isin(["3 Rnd (5-5-5)", "5 Rnd (5-5-5-5-5)"])]

        # Remove female fights
        data = data[data["gender"] == "M"]

        # Remove disqualified and doctor's stoppage
        data = data[data["result"].isin(["Decision", "KO/TKO", "Submission"])]

        # Remove draws, no contests, and missing winners
        data = data[(~data["winner"].isin(["Draw", "NC"])) & (~data["winner"].isna())]

        return data

    @property
    def round_stat_names(self) -> List[str]:
        """
        The names of the round statistics.

        This property returns the names of the columns in the rounds data
        that are not in ["fight_id", "fighter_id", "round"]. It also returns
        the same names with "_opponent" appended, to represent the opponent's
        statistics.

        Returns:
            A list of strings, the names of the round statistics.
        """
        return [
            c
            for c in self.scraper.fight_scraper.rounds_handler.dtypes.keys()
            if c not in ["fight_id", "fighter_id", "round"]
        ] + [
            c + "_opponent"
            for c in self.scraper.fight_scraper.rounds_handler.dtypes.keys()
            if c not in ["fight_id", "fighter_id", "round"]
        ]

    @property
    def stat_names(self) -> List[str]:
        """
        The names of the statistics.

        This property returns the names of the columns in the rounds data
        that are not in ["fight_id", "fighter_id", "round"]. It also returns
        the same names with "_opponent" appended, to represent the opponent's
        statistics, and the names of the columns "KO", "Sub" and "win",
        which are the result of the fight, with "_opponent" appended to
        represent the opponent's result.

        Returns:
            A list of strings, the names of the statistics.
        """
        stat_names = self.round_stat_names
        for field in ("KO", "Sub", "win"):
            stat_names += [field, field + "_opponent"]

        return stat_names

    @property
    def aggregated_fields(self) -> List[str]:
        """
        The fields that are aggregated over the fighter's history.

        This property returns all the statistic names, including the ones
        with "_opponent" appended to represent the opponent's statistics.
        It also returns the aggregated fields added by the data enhancers.

        Returns:
            A list of strings, the names of the aggregated fields.
        """
        aggregated_fields = self.stat_names

        for data_enhancer in self.data_enhancers:
            aggregated_fields += data_enhancer.aggregated_fields

        return aggregated_fields

    @property
    def normalized_fields(self) -> List[str]:
        """
        The fields that are normalized over the fighter's history.

        These fields are normalized in the sense that they are divided by
        their mean value in the history of the fighter. This is done to
        reduce the effect of outliers and to make the data more comparable
        between different fighters.

        The fields normalized are:
        - "age"
        - "time_since_last_fight"
        - "fighter_height_cm"
        - "weight",
        - All the aggregated fields (see :meth:`aggregated_fields`),
          and the same fields with "_per_minute" and "_per_fight" appended,
          which represent the aggregated fields per minute and per fight,
          respectively.

        It also returns the normalized fields added by the data enhancers.

        Returns:
            A list of strings, the names of the normalized fields.
        """
        normalized_fields = [
            "age",
            "time_since_last_fight",
            "fighter_height_cm",
            "weight",
        ]

        for field in self.aggregated_fields:
            normalized_fields += [field, field + "_per_minute", field + "_per_fight"]

        for data_enhancer in self.data_enhancers:
            normalized_fields += data_enhancer.normalized_fields

        return normalized_fields

    def group_round_data(self, data: pd.DataFrame) -> pd.DataFrame:
        """
        Group the round data by the fixed fields and sum the round statistics.

        The fixed fields are the columns in the data that are not in the round
        statistics and not in ["round"]. The round statistics are the columns
        in the data that are in the round statistics and not in ["round"].

        Args:
            data: The data to be grouped.

        Returns:
            The grouped data, with the round statistics summed.
        """
        fixed_fields = [
            c
            for c in data.columns
            if c
            not in self.round_stat_names
            + [
                "round",
            ]
        ]

        return (
            data.groupby(
                fixed_fields, dropna=False
            )  # Important to group nans as valid values.
            .sum()
            .reset_index()
            .drop("round", axis=1)
        ).sort_values(by=["fighter_id", "event_date"])

    def aggregate_data(self) -> None:
        """
        Aggregate the data by combining the round statistics over the history of the
        fighters.

        The aggregated data is stored in the attribute data_aggregated.

        The specific implementation depends on the DataAggregator used.
        """
        self.data_aggregated = self.data_aggregator.aggregate_data(self)

        for data_enhancer in self.data_enhancers:
            self.data_aggregated = data_enhancer.add_aggregated_fields(
                self.data_aggregated
            )

    def add_per_minute_and_fight_stats(self) -> None:
        """
        Add two new columns to the aggregated data for each statistic.

        The first column is the statistic per minute, computed by dividing the
        statistic by the total time in the octagon. The second column is the
        statistic per fight, computed by dividing the statistic by the number
        of fights.

        The new columns are named <statistic>_per_minute and <statistic>_per_fight,
        where <statistic> is the name of the statistic.

        Args:
            None

        Returns:
            None
        """
        new_columns = {}

        for column in self.aggregated_fields:
            new_columns[column + "_per_minute"] = (
                self.data_aggregated[column]
                / self.data_aggregated["weighted_total_time"]
            )
            new_columns[column + "_per_fight"] = (
                self.data_aggregated[column]
                / self.data_aggregated["weighted_num_fight"]
            )

        self.data_aggregated = pd.concat(
            [self.data_aggregated, pd.DataFrame(new_columns)], axis=1
        ).copy()

    def normalize_data(self) -> None:
        """
        Normalize the aggregated data by dividing each column by its mean.

        This is done so that the data is more comparable between different fighters.
        The fields normalized are the ones in normalized_fields.

        Args:
            None

        Returns:
            None
        """
        data_normalized = self.data_aggregated.copy()

        logger.info(f"Fields to be normalized: {self.normalized_fields}")

        for column in self.normalized_fields:
            mean = self.data_aggregated[column].mean()
            data_normalized[column] = data_normalized[column] / mean

            self.normalization_factors[column] = mean

        self.data_normalized = data_normalized

aggregated_fields: List[str] property

The fields that are aggregated over the fighter's history.

This property returns all the statistic names, including the ones with "_opponent" appended to represent the opponent's statistics. It also returns the aggregated fields added by the data enhancers.

Returns:

    List[str]: A list of strings, the names of the aggregated fields.

normalized_fields: List[str] property

The fields that are normalized over the fighter's history.

These fields are normalized in the sense that they are divided by their mean value in the history of the fighter. This is done to reduce the effect of outliers and to make the data more comparable between different fighters.

The fields normalized are:

- "age"
- "time_since_last_fight"
- "fighter_height_cm"
- "weight"
- All the aggregated fields (see aggregated_fields), and the same fields with "_per_minute" and "_per_fight" appended, which represent the aggregated fields per minute and per fight, respectively.

It also returns the normalized fields added by the data enhancers.

Returns:

    List[str]: A list of strings, the names of the normalized fields.

round_stat_names: List[str] property

The names of the round statistics.

This property returns the names of the columns in the rounds data that are not in ["fight_id", "fighter_id", "round"]. It also returns the same names with "_opponent" appended, to represent the opponent's statistics.

Returns:

    List[str]: A list of strings, the names of the round statistics.

stat_names: List[str] property

The names of the statistics.

This property returns the names of the columns in the rounds data that are not in ["fight_id", "fighter_id", "round"]. It also returns the same names with "_opponent" appended, to represent the opponent's statistics, and the names of the columns "KO", "Sub" and "win", which are the result of the fight, with "_opponent" appended to represent the opponent's result.

Returns:

    List[str]: A list of strings, the names of the statistics.
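
A sketch of how this list is assembled, using hypothetical round column names in place of the rounds handler's dtypes:

# Hypothetical round columns, for illustration only.
round_columns = ["fight_id", "fighter_id", "round", "strikes_succ", "ctrl_time"]
base = [c for c in round_columns if c not in ["fight_id", "fighter_id", "round"]]

round_stat_names = base + [c + "_opponent" for c in base]
stat_names = list(round_stat_names)
for field in ("KO", "Sub", "win"):
    stat_names += [field, field + "_opponent"]

print(stat_names)
# ['strikes_succ', 'ctrl_time', 'strikes_succ_opponent', 'ctrl_time_opponent',
#  'KO', 'KO_opponent', 'Sub', 'Sub_opponent', 'win', 'win_opponent']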

__init__(data_folder=None, ufc_scraper=None, bfo_scraper=None, data_aggregator=None, data_enhancers=[])

Constructor for DataProcessor.

Parameters:

    data_folder (Optional[Path | str], default None): The folder containing the data.
    ufc_scraper (Optional[UFCScraper], default None): The scraper to use for ufc data.
    bfo_scraper (Optional[BestFightOddsScraper], default None): The scraper to use for best fight odds data.
    data_aggregator (Optional[DataAggregator], default None): The data aggregator to use for aggregating data.
    data_enhancers (List[DataEnhancer], default []): The list of data enhancers to apply to the data.

Raises:

    ValueError: If data_folder is None and both ufc_scraper and bfo_scraper are None.

Source code in ufcpredictor/data_processor.py
def __init__(
    self,
    data_folder: Optional[Path | str] = None,
    ufc_scraper: Optional[UFCScraper] = None,
    bfo_scraper: Optional[BestFightOddsScraper] = None,
    data_aggregator: Optional[DataAggregator] = None,
    data_enhancers: List[DataEnhancer] = [],
) -> None:
    """
    Constructor for DataProcessor.

    Args:
        data_folder: The folder containing the data.
        ufc_scraper: The scraper to use for ufc data.
        bfo_scraper: The scraper to use for best fight odds data.
        data_aggregator: The data aggregator to use for aggregating data.
        data_enhancers: The list of data enhancers to apply to the data.

    Raises:
        ValueError: If data_folder is None and both ufc_scraper and
            bfo_scraper are None.
    """
    if data_folder is None and (ufc_scraper is None or bfo_scraper is None):
        raise ValueError(
            "If data_folder is None, both ufc_scraper and bfo_scraper "
            "should be provided"
        )

    self.scraper = ufc_scraper or UFCScraper(data_folder=data_folder)
    self.bfo_scraper = bfo_scraper or BestFightOddsScraper(
        data_folder=data_folder, n_sessions=-1
    )

    self.data_aggregator = data_aggregator or DefaultDataAggregator()
    self.data_enhancers = data_enhancers

add_key_stats(data) staticmethod

Add key statistics to the dataframe.

This function adds columns to the dataframe indicating whether a fighter has won a fight via KO, submission or decision, and whether the opponent has won a fight via KO, submission or decision. It also adds a column indicating the age of the fighter at the time of the fight.

Parameters:

    data (DataFrame, required): The dataframe to be processed.

Returns:

    DataFrame: The dataframe with the added columns.

Source code in ufcpredictor/data_processor.py
@staticmethod
def add_key_stats(data: pd.DataFrame) -> pd.DataFrame:
    """
    Add key statistics to the dataframe.

    This function adds columns to the dataframe indicating whether a fighter
    has won a fight via KO, submission or decision, and whether the opponent
    has won a fight via KO, submission or decision. It also adds a column
    indicating the age of the fighter at the time of the fight.

    Args:
        data: The dataframe to be processed.

    Returns:
        The dataframe with the added columns.
    """
    #############################################
    # Add some missing stats
    # KO, Submission and Win
    #############################################
    # Whether fighter has KOd his opponent
    data["KO"] = np.where(
        (data["result"].str.contains("KO"))
        & (data["winner"] == data["fighter_id"]),
        1,
        0,
    )

    # Whether the fighter has been KOd by his opponent
    data["KO_opponent"] = np.where(
        (data["result"].str.contains("KO"))
        & (data["winner"] != data["fighter_id"]),
        1,
        0,
    )

    # Same for submission
    data["Sub"] = np.where(
        (data["result"].str.contains("Submission"))
        & (data["winner"] == data["fighter_id"]),
        1,
        0,
    )

    data["Sub_opponent"] = np.where(
        (data["result"].str.contains("Submission"))
        & (data["winner"] != data["fighter_id"]),
        1,
        0,
    )

    data["win"] = np.where(data["winner"] == data["fighter_id"], 1, 0)
    data["win_opponent"] = np.where(data["winner"] != data["fighter_id"], 1, 0)
    data["age"] = (data["event_date"] - data["fighter_dob"]).dt.days / 365

    return data
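
A toy check of the flag logic above, mirroring the np.where calls with two hypothetical rows:

import numpy as np
import pandas as pd

toy = pd.DataFrame({
    "result": ["KO/TKO", "Submission"],
    "winner": ["a", "b"],
    "fighter_id": ["a", "a"],
})
toy["KO"] = np.where(
    toy["result"].str.contains("KO") & (toy["winner"] == toy["fighter_id"]), 1, 0
)
toy["win"] = np.where(toy["winner"] == toy["fighter_id"], 1, 0)
print(toy[["KO", "win"]].values.tolist())  # [[1, 1], [0, 0]]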

add_per_minute_and_fight_stats()

Add two new columns to the aggregated data for each statistic.

The first column is the statistic per minute, computed by dividing the statistic by the total time in the octagon. The second column is the statistic per fight, computed by dividing the statistic by the number of fights.

The new columns are named <statistic>_per_minute and <statistic>_per_fight, where <statistic> is the name of the statistic.

Returns:

    None

Source code in ufcpredictor/data_processor.py
def add_per_minute_and_fight_stats(self) -> None:
    """
    Add two new columns to the aggregated data for each statistic.

    The first column is the statistic per minute, computed by dividing the
    statistic by the total time in the octagon. The second column is the
    statistic per fight, computed by dividing the statistic by the number
    of fights.

    The new columns are named <statistic>_per_minute and <statistic>_per_fight,
    where <statistic> is the name of the statistic.

    Args:
        None

    Returns:
        None
    """
    new_columns = {}

    for column in self.aggregated_fields:
        new_columns[column + "_per_minute"] = (
            self.data_aggregated[column]
            / self.data_aggregated["weighted_total_time"]
        )
        new_columns[column + "_per_fight"] = (
            self.data_aggregated[column]
            / self.data_aggregated["weighted_num_fight"]
        )

    self.data_aggregated = pd.concat(
        [self.data_aggregated, pd.DataFrame(new_columns)], axis=1
    ).copy()
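
A minimal sketch of the naming convention, assuming the aggregator has already produced the weighted_total_time and weighted_num_fight columns the code above relies on (the statistic name is illustrative):

import pandas as pd

agg = pd.DataFrame({
    "strikes_succ": [300.0],
    "weighted_total_time": [900.0],   # weighted time as produced by the aggregator
    "weighted_num_fight": [3.0],
})
agg["strikes_succ_per_minute"] = agg["strikes_succ"] / agg["weighted_total_time"]
agg["strikes_succ_per_fight"] = agg["strikes_succ"] / agg["weighted_num_fight"]
print(agg[["strikes_succ_per_minute", "strikes_succ_per_fight"]].iloc[0].tolist())
# [0.3333333333333333, 100.0]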

aggregate_data()

Aggregate the data by combining the round statistics over the history of the fighters.

The aggregated data is stored in the attribute data_aggregated.

The specific implementation depends on the DataAggregator used.

Source code in ufcpredictor/data_processor.py
def aggregate_data(self) -> None:
    """
    Aggregate the data by combining the round statistics over the history of the
    fighters.

    The aggregated data is stored in the attribute data_aggregated.

    The specific implementation depends on the DataAggregator used.
    """
    self.data_aggregated = self.data_aggregator.aggregate_data(self)

    for data_enhancer in self.data_enhancers:
        self.data_aggregated = data_enhancer.add_aggregated_fields(
            self.data_aggregated
        )

apply_filters(data) staticmethod

Apply filters to the dataframe.

This function applies filters to the dataframe to remove fights:

- Before August 1, 2008, since odds are not available for these
- With non-standard fight formats (time_format not in ["3 Rnd (5-5-5)", "5 Rnd (5-5-5-5-5)"])
- With female fighters (gender not in ["M"])
- With disqualified or doctor's stoppage results (result not in ["Decision", "KO/TKO", "Submission"])
- With draws, no contests, or missing winners (winner in ("Draw", "NC") or winner.isna())

Parameters:

    data (DataFrame, required): The dataframe to be processed.

Returns:

    DataFrame: The dataframe with the applied filters.

Source code in ufcpredictor/data_processor.py
@staticmethod
def apply_filters(data: pd.DataFrame) -> pd.DataFrame:
    """
    Apply filters to the dataframe.

    This function applies filters to the dataframe to remove fights:
    - Before August 1, 2008, since odds are not available for these
    - With non-standard fight formats (time_format not in ["3 Rnd (5-5-5)", "5 Rnd (5-5-5-5-5)"])
    - With female fighters (gender not in ["M"])
    - With disqualified or doctor's stoppage results (result not in ["Decision", "KO/TKO", "Submission"])
    - With draws, no contests, or missing winners (winner in ("Draw", "NC") or winner.isna())

    Args:
        data: The dataframe to be processed.

    Returns:
        The dataframe with the applied filters.
    """
    # Remove old fights since I don't have odds for these
    data = data[data["event_date"].dt.date >= datetime.date(2008, 8, 1)]

    # Remove non-standard fight format
    data = data[data["time_format"].isin(["3 Rnd (5-5-5)", "5 Rnd (5-5-5-5-5)"])]

    # Remove female fights
    data = data[data["gender"] == "M"]

    # Remove disqualified and doctor's stoppage
    data = data[data["result"].isin(["Decision", "KO/TKO", "Submission"])]

    # Remove draws, no contests, and missing winners
    data = data[(~data["winner"].isin(["Draw", "NC"])) & (~data["winner"].isna())]

    return data

convert_odds_to_decimal(data) staticmethod

Convert odds from American format to decimal format.

Parameters:

    data (DataFrame, required): The dataframe with the odds in American format.

Returns:

    DataFrame: The dataframe with the odds in decimal format.

Source code in ufcpredictor/data_processor.py
@staticmethod
def convert_odds_to_decimal(data: pd.DataFrame) -> pd.DataFrame:
    """
    Convert odds from American format to decimal format.

    Args:
        data: The dataframe with the odds in American format.

    Returns:
        The dataframe with the odds in decimal format.
    """
    for field in "opening", "closing_range_min", "closing_range_max":
        data[field] = data[field].astype(float)
        msk = data[field] > 0

        data.loc[msk, field] = data.loc[msk, field] / 100 + 1
        data.loc[~msk, field] = 100 / -data.loc[~msk, field] + 1

    return data
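
As a quick sanity check of the conversion (plain arithmetic, not part of the library): American odds of +150 and -200 correspond to decimal odds of 2.5 and 1.5.

american = [150.0, -200.0]
decimal = [a / 100 + 1 if a > 0 else 100 / -a + 1 for a in american]
print(decimal)  # [2.5, 1.5]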

fill_weight(data) staticmethod

Fill the weight column using the weight_class column and the weight_dict.

The weight_dict is a dictionary mapping the weight classes to their corresponding weights in lbs. The weights are then filled in the weight column according to the weight classes in the weight_class column.

This function also removes rows with null weight classes, or open weight or catch weight (agreed weight outside a weight class).

Parameters:

    data (DataFrame, required): The dataframe to be processed.

Returns:

    DataFrame: The dataframe with the weight column filled.

Source code in ufcpredictor/data_processor.py
@staticmethod
def fill_weight(data: pd.DataFrame) -> pd.DataFrame:
    """
    Fill the weight column using the weight_class column and the weight_dict.

    The weight_dict is a dictionary mapping the weight classes to their
    corresponding weights in lbs. The weights are then filled in the weight
    column according to the weight classes in the weight_class column.

    This function also removes rows with null weight classes, or open weight
    or catch weight (agreed weight outside a weight class).

    Args:
        data: The dataframe to be processed.

    Returns:
        The dataframe with the weight column filled.
    """
    data.loc[:, "weight"] = data["weight_class"].map(weight_dict)

    ##################################################################################
    # Remove null weight classes, or open weight or catch weight (agreed weight outside a weight class)
    ##################################################################################
    data = data[
        (data["weight_class"] != "NULL")
        & (data["weight_class"] != "Catch Weight")
        & (data["weight_class"] != "Open Weight")
    ]

    return data
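
The actual weight_dict lives in the library; a hypothetical subset is used below purely to illustrate how the mapping and the filter are applied:

import pandas as pd

# Hypothetical subset of weight_dict, for illustration only.
weight_dict = {"Lightweight": 155, "Welterweight": 170, "Heavyweight": 265}

toy = pd.DataFrame({"weight_class": ["Lightweight", "Welterweight", "Open Weight"]})
toy["weight"] = toy["weight_class"].map(weight_dict)
toy = toy[~toy["weight_class"].isin(["NULL", "Catch Weight", "Open Weight"])]
print(toy["weight"].tolist())  # [155.0, 170.0]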

fix_date_and_time_fields(data) staticmethod

Fix date and time fields in the dataframe.

This function takes care of converting control time, finish time and total time from minutes to seconds. It also converts the event date and fighter date of birth to datetime objects.

The dataframe is then sorted by fighter id and event date.

Parameters:

    data (DataFrame, required): The dataframe to be processed.

Returns:

    DataFrame: The dataframe with the fields fixed.

Source code in ufcpredictor/data_processor.py
@staticmethod
def fix_date_and_time_fields(data: pd.DataFrame) -> pd.DataFrame:
    """
    Fix date and time fields in the dataframe.

    This function takes care of converting control time, finish time
    and total time from minutes to seconds. It also converts the
    event date and fighter date of birth to datetime objects.

    The dataframe is then sorted by fighter id and event date.

    Args:
        data: The dataframe to be processed.

    Returns:
        The dataframe with the fields fixed.
    """
    data["ctrl_time"] = data["ctrl_time"].apply(convert_minutes_to_seconds)
    data["ctrl_time_opponent"] = data["ctrl_time_opponent"].apply(
        convert_minutes_to_seconds
    )
    data["finish_time"] = data["finish_time"].apply(convert_minutes_to_seconds)
    data["total_time"] = (data["finish_round"] - 1) * 5 * 60 + data["finish_time"]
    data["event_date"] = pd.to_datetime(data["event_date"])
    data["fighter_dob"] = pd.to_datetime(data["fighter_dob"])

    data = data.sort_values(by=["fighter_id", "event_date"])

    return data
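
For example, the total_time formula assumes five-minute rounds: a finish at 2:34 of round 3 (154 seconds into the round) gives a total fight time of 754 seconds.

finish_round = 3
finish_time = 154                                  # 2:34 expressed in seconds
total_time = (finish_round - 1) * 5 * 60 + finish_time
print(total_time)                                  # 754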

get_fighter_id(name)

Returns the id of the fighter with the given name. Search is performed using fuzzywuzzy. If multiple matches are found, the first one is returned.

Parameters:

    name (str, required): The name of the fighter.

Returns:

    str: The id of the fighter.

Source code in ufcpredictor/data_processor.py
def get_fighter_id(self, name: str) -> str:
    """
    Returns the id of the fighter with the given name.
    Search is performed using fuzzywuzzy.
    If multiple matches are found, the first one is returned.

    Args:
        name: The name of the fighter.

    Returns:
        The id of the fighter.
    """
    best_name, score = extractOne(name, self.fighter_ids.keys())

    if score < 100:
        logger.warning(
            f"Fighter found for {name} with {score}% accuracy: {best_name}"
        )
    return self.fighter_ids[best_name]
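
A hedged usage example, assuming the processor object from the workflow sketch near the top of this page (the name below is hypothetical, and load_data must have been called so that fighter_ids is populated):

processor.load_data()
fighter_id = processor.get_fighter_id("Jon Jones")     # exact or fuzzy match
print(processor.get_fighter_name(fighter_id))          # round-trips to the stored name

If the best match scores below 100, a warning with the matched name is logged, as the source above shows.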

get_fighter_name(id_)

Returns the name of the fighter with the given id.

Parameters:

    id_ (str, required): The id of the fighter.

Returns:

    str: The name of the fighter.

Source code in ufcpredictor/data_processor.py
def get_fighter_name(self, id_: str) -> str:
    """
    Returns the name of the fighter with the given id.

    Args:
        id_: The id of the fighter.

    Returns:
        The name of the fighter.
    """
    return self.fighter_names[id_]

group_round_data(data)

Group the round data by the fixed fields and sum the round statistics.

The fixed fields are the columns in the data that are not in the round statistics and not in ["round"]. The round statistics are the columns in the data that are in the round statistics and not in ["round"].

Parameters:

    data (DataFrame, required): The data to be grouped.

Returns:

    DataFrame: The grouped data, with the round statistics summed.

Source code in ufcpredictor/data_processor.py
def group_round_data(self, data: pd.DataFrame) -> pd.DataFrame:
    """
    Group the round data by the fixed fields and sum the round statistics.

    The fixed fields are the columns in the data that are not in the round
    statistics and not in ["round"]. The round statistics are the columns
    in the data that are in the round statistics and not in ["round"].

    Args:
        data: The data to be grouped.

    Returns:
        The grouped data, with the round statistics summed.
    """
    fixed_fields = [
        c
        for c in data.columns
        if c
        not in self.round_stat_names
        + [
            "round",
        ]
    ]

    return (
        data.groupby(
            fixed_fields, dropna=False
        )  # Important to group nans as valid values.
        .sum()
        .reset_index()
        .drop("round", axis=1)
    ).sort_values(by=["fighter_id", "event_date"])
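
A toy illustration of the grouping, with hypothetical columns: the per-round rows of one fighter in one fight collapse into a single row whose round statistics are summed.

import pandas as pd

rounds = pd.DataFrame({
    "fight_id": ["f1", "f1", "f1"],
    "fighter_id": ["a", "a", "a"],
    "event_date": ["2020-01-01"] * 3,
    "round": [1, 2, 3],
    "strikes_succ": [10, 12, 8],      # a round statistic
})
fixed_fields = ["fight_id", "fighter_id", "event_date"]
grouped = (
    rounds.groupby(fixed_fields, dropna=False)
    .sum()
    .reset_index()
    .drop("round", axis=1)
)
print(grouped["strikes_succ"].iloc[0])  # 30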

join_dataframes()

Joins all the relevant dataframes (fight, fighter, event, and odds).

It duplicates the current fight data to create two rows per match, one row for each fighter, and assigns fighter and opponent to each other. Then, it merges the fighter data, round data, and odds data to the previous table. Finally, it adds the date of the event to the dataframe.

Returns:

    DataFrame: The joined dataframe.

Source code in ufcpredictor/data_processor.py
def join_dataframes(self) -> pd.DataFrame:
    """
    Joins all the relevant dataframes (fight, fighter, event, and odds).

    It duplicates the current fight data to create two rows per match,
    one row for each fighter, and assigns fighter and opponent to each other.
    Then, it merges the fighter data, round data, and odds data to the
    previous table. Finally, it adds the date of the event to the dataframe.

    Returns:
        The joined dataframe.
    """
    fight_data = self.scraper.fight_scraper.data
    round_data = self.scraper.fight_scraper.rounds_handler.data
    fighter_data = self.scraper.fighter_scraper.data
    event_data = self.scraper.event_scraper.data

    odds_data = self.bfo_scraper.data

    ###########################################################
    # I want to create two rows per match, one row for each fighter
    ###########################################################
    # Hence I need to duplicate the current fight data
    # Assigning fighter and opponent to each other
    data = pd.concat(
        [
            fight_data.rename(
                columns={
                    "fighter_1": "opponent_id",
                    "fighter_2": "fighter_id",
                    "scores_1": "opponent_score",
                    "scores_2": "fighter_score",
                }
            ),
            fight_data.rename(
                columns={
                    "fighter_2": "opponent_id",
                    "fighter_1": "fighter_id",
                    "scores_2": "opponent_score",
                    "scores_1": "fighter_score",
                }
            ),
        ]
    )

    # I am merging the fighter data to the previous table
    # This includes height, reach etc...
    fighter_data["fighter_name"] = (
        fighter_data["fighter_f_name"]
        + " "
        + fighter_data["fighter_l_name"].fillna("")
    )
    data = data.merge(
        fighter_data,  # [fighter_fields],
        on="fighter_id",
        how="left",
    )

    data = data.merge(
        fighter_data[["fighter_id", "fighter_name", "fighter_nickname"]],
        left_on="opponent_id",
        right_on="fighter_id",
        how="left",
        suffixes=("", "_opponent"),
    )

    #############################################################
    # Add round data.
    #############################################################

    # Merging columns
    round_data = pd.merge(
        round_data,
        round_data,
        on=["fight_id", "round"],
        suffixes=("", "_opponent"),
    )

    # And then remove the match of the fighter with itself
    round_data = round_data[
        round_data["fighter_id"] != round_data["fighter_id_opponent"]
    ]

    data = data.merge(
        round_data,
        on=[
            "fight_id",
            "fighter_id",
            "fighter_id_opponent",
        ],
    )

    ##############################################################
    # Add odds data
    ###############################################################
    data = data.merge(
        odds_data,
        on=["fight_id", "fighter_id"],
    )

    # Add the date of the event to the dataframe
    data = data.merge(
        event_data[["event_id", "event_date"]],  # I only need the date for now,
        on="event_id",
    )

    return data
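
A toy sketch of the duplication trick with a single hypothetical fight: each fight contributes two rows, one per fighter, by renaming the fighter_1/fighter_2 columns in opposite directions.

import pandas as pd

fights = pd.DataFrame({"fight_id": ["f1"], "fighter_1": ["a"], "fighter_2": ["b"]})
data = pd.concat([
    fights.rename(columns={"fighter_1": "opponent_id", "fighter_2": "fighter_id"}),
    fights.rename(columns={"fighter_2": "opponent_id", "fighter_1": "fighter_id"}),
])
print(data[["fight_id", "fighter_id", "opponent_id"]].values.tolist())
# [['f1', 'b', 'a'], ['f1', 'a', 'b']]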

load_data()

Loads and processes all the data.

First, it joins all the relevant dataframes (fight, fighter, event, and odds). Then, it fixes the date and time fields, converts the odds to decimal format, fills the weight for each fighter (if not available), adds key statistics (KO, Submission, and Win), and applies filters to the data. Finally, it groups the round data by fighter and fight, and assigns the result to the data attribute.

This method should be called before any other method.

Source code in ufcpredictor/data_processor.py
def load_data(self) -> None:
    """
    Loads and processes all the data.

    First, it joins all the relevant dataframes (fight, fighter, event, and odds).
    Then, it fixes the date and time fields, converts the odds to decimal format,
    fills the weight for each fighter (if not available), adds key statistics
    (KO, Submission, and Win), and applies filters to the data.
    Finally, it groups the round data by fighter and fight, and assigns the result
    to the data attribute.

    This method should be called before any other method.
    """
    data = self.join_dataframes()
    data = self.fix_date_and_time_fields(data)
    data = self.convert_odds_to_decimal(data)
    data = self.fill_weight(data)
    data = self.add_key_stats(data)
    data = self.apply_filters(data)
    self.data = self.group_round_data(data)
    self.data["num_fight"] = self.data.groupby("fighter_id").cumcount() + 1

    for data_enhancer in self.data_enhancers:
        self.data = data_enhancer.add_data_fields(self.data)

    names = self.data["fighter_name"].values
    ids = self.data["fighter_id"].values

    self.fighter_names = {id_: name_ for id_, name_ in zip(ids, names)}
    self.fighter_ids = {name_: id_ for id_, name_ in zip(ids, names)}

normalize_data()

Normalize the aggregated data by dividing each column by its mean.

This is done so that the data is more comparable between different fighters. The fields normalized are the ones in normalized_fields.

Returns:

    None

Source code in ufcpredictor/data_processor.py
def normalize_data(self) -> None:
    """
    Normalize the aggregated data by dividing each column by its mean.

    This is done so that the data is more comparable between different fighters.
    The fields normalized are the ones in normalized_fields.

    Args:
        None

    Returns:
        None
    """
    data_normalized = self.data_aggregated.copy()

    logger.info(f"Fields to be normalized: {self.normalized_fields}")

    for column in self.normalized_fields:
        mean = self.data_aggregated[column].mean()
        data_normalized[column] = data_normalized[column] / mean

        self.normalization_factors[column] = mean

    self.data_normalized = data_normalized
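
The normalization itself is plain mean scaling; a toy example with an illustrative column:

import pandas as pd

ages = pd.Series([25.0, 30.0, 35.0])
mean = ages.mean()                 # 30.0, stored as the normalization factor
print((ages / mean).tolist())      # [0.8333333333333334, 1.0, 1.1666666666666667]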

Datasets

This module contains dataset classes designed to handle UFC fight data for training and testing neural network models.

The dataset classes provide a structured way to store and retrieve data for fighter characteristics, fight outcomes, and odds. They are designed to work with the DataProcessor class to prepare and normalize the data.

BasicDataset

Bases: Dataset

A basic dataset class designed to handle UFC fight data for training and testing neural network models.

This class provides a simple way to store and retrieve data for fighter characteristics, fight outcomes, and odds. It is designed to be used with the SymmetricFightNet model and other UFC prediction models.
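
A hedged usage sketch, assuming a DataProcessor that has already been loaded, aggregated, and normalized as shown above (using every fight id is purely illustrative; a real train/test split would partition them):

from torch.utils.data import DataLoader

from ufcpredictor.datasets import BasicDataset

fight_ids = list(processor.data["fight_id"].unique())
dataset = BasicDataset(data_processor=processor, fight_ids=fight_ids)
loader = DataLoader(dataset, batch_size=64, shuffle=True)

X1, X2, X3, winner, odds_1, odds_2 = next(iter(loader))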

Source code in ufcpredictor/datasets.py
class BasicDataset(Dataset):
    """
    A basic dataset class designed to handle UFC fight data for training and testing
    neural network models.

    This class provides a simple way to store and retrieve data for fighter
    characteristics, fight outcomes, and odds. It is designed to be used with the
    SymmetricFightNet model and other UFC prediction models.
    """

    X_set = [
        "age",
        "body_strikes_att_opponent_per_minute",
        "body_strikes_att_per_minute",
        "body_strikes_succ_opponent_per_minute",
        "body_strikes_succ_per_minute",
        "clinch_strikes_att_opponent_per_minute",
        "clinch_strikes_att_per_minute",
        "clinch_strikes_succ_opponent_per_minute",
        "clinch_strikes_succ_per_minute",
        "ctrl_time_opponent_per_minute",
        "ctrl_time_per_minute",
        "distance_strikes_att_opponent_per_minute",
        "distance_strikes_att_per_minute",
        "distance_strikes_succ_opponent_per_minute",
        "distance_strikes_succ_per_minute",
        "fighter_height_cm",
        "ground_strikes_att_opponent_per_minute",
        "ground_strikes_att_per_minute",
        "ground_strikes_succ_opponent_per_minute",
        "ground_strikes_succ_per_minute",
        "head_strikes_att_opponent_per_minute",
        "head_strikes_att_per_minute",
        "head_strikes_succ_opponent_per_minute",
        "head_strikes_succ_per_minute",
        "knockdowns_opponent_per_minute",
        "knockdowns_per_minute",
        "KO_opponent_per_fight",
        "KO_opponent_per_minute",
        "KO_per_fight",
        "KO_per_minute",
        "leg_strikes_att_opponent_per_minute",
        "leg_strikes_att_per_minute",
        "leg_strikes_succ_opponent_per_minute",
        "leg_strikes_succ_per_minute",
        "num_fight",
        "reversals_opponent_per_minute",
        "reversals_per_minute",
        "strikes_att_opponent_per_minute",
        "strikes_att_per_minute",
        "strikes_succ_opponent_per_minute",
        "strikes_succ_per_minute",
        "Sub_opponent_per_fight",
        "Sub_opponent_per_minute",
        "Sub_per_fight",
        "Sub_per_minute",
        "submission_att_opponent_per_minute",
        "submission_att_per_minute",
        "takedown_att_opponent_per_minute",
        "takedown_att_per_minute",
        "takedown_succ_opponent_per_minute",
        "takedown_succ_per_minute",
        "time_since_last_fight",
        "total_strikes_att_opponent_per_minute",
        "total_strikes_att_per_minute",
        "total_strikes_succ_opponent_per_minute",
        "total_strikes_succ_per_minute",
        "win_opponent_per_fight",
        "win_per_fight",
    ]

    Xf_set: List[str] = []

    def __init__(
        self,
        data_processor: DataProcessor,
        fight_ids: List[str],
        X_set: Optional[List[str]] = None,
        Xf_set: Optional[List[str]] = None,
    ) -> None:
        """
        Constructor for BasicDataset.

        Args:
            data_processor: The DataProcessor instance that contains the data.
            fight_ids: The list of fight ids to include in the dataset.
            X_set: The list of columns to include in the dataset. If None, the
                class-level default X_set is used.
            Xf_set: The list of extra feature columns to include in the dataset.
                If None, the class-level default Xf_set (empty) is used.

        Raises:
            ValueError: If some columns are not found in the normalized data.
        """
        self.data_processor = data_processor
        self.fight_ids = fight_ids

        if X_set is not None:
            self.X_set = X_set

        if Xf_set is not None:
            self.Xf_set = Xf_set

        not_found = []
        for column in self.X_set + self.Xf_set:
            if column not in self.data_processor.data_normalized.columns:
                not_found.append(column)

        if len(not_found) > 0:
            raise ValueError(f"Columns not found in normalized data: {not_found}")

        self.load_data()

    def load_data(self) -> None:
        """
        Loads the data into a format that can be used to train a model.

        The data is first reduced to only include the columns specified in X_set.
        Then, the stats are shifted to get the stats prior to each fight.
        The data is then merged with itself to get one row per match with the data
        from the two fighters.
        The matchings of the fighter with itself are removed and only one row per
        match is kept.
        Finally, the data is loaded into torch tensors.
        """
        reduced_data = self.data_processor.data_normalized.copy()

        # We shift stats because the input for the model should be the
        # stats prior to the fight
        for x in self.X_set:
            if x not in ["age", "num_fight", "time_since_last_fight"]:
                reduced_data[x] = reduced_data.groupby("fighter_id")[x].shift(1)

        # We remove invalid fights
        reduced_data = reduced_data[reduced_data["fight_id"].isin(self.fight_ids)]

        # We now merge stats with itself to get one row per match with the data
        # from the two fighters
        fight_data = reduced_data.merge(
            reduced_data,
            left_on="fight_id",
            right_on="fight_id",
            how="inner",
            suffixes=("_x", "_y"),
        )

        # Remove matchings of the fighter with itself and also only keep
        # one row per match (fighter1 vs fighter2 is the same as fighter 2 vs fighter 1)
        fight_data = fight_data[
            fight_data["fighter_id_x"] != fight_data["fighter_id_y"]
        ]
        fight_data = fight_data.drop_duplicates(subset=["fight_id"], keep="first")

        # Now we load the data into torch tensors
        # This is a list of FloatTensors each having a size equal to the number
        # of fights.
        self.data: List[torch.Tensor] = [
            torch.FloatTensor(
                np.asarray([fight_data[x + "_x"].values for x in self.X_set]).T
            ),
            torch.FloatTensor(
                np.asarray([fight_data[x + "_y"].values for x in self.X_set]).T
            ),
            torch.FloatTensor(
                np.asarray([fight_data[xf + "_x"].values for xf in self.Xf_set]).T
            ),
            torch.FloatTensor(
                (fight_data["winner_x"] != fight_data["fighter_id_x"]).values
            ),
            torch.FloatTensor(fight_data["opening_x"].values),
            torch.FloatTensor(fight_data["opening_y"].values),
        ]

        if len(self.Xf_set) == 0:
            self.data[2] = torch.empty(len(fight_data["winner_x"]), 0)

        self.fight_data = fight_data

    def __len__(self) -> int:
        """Returns the size of the dataset.

        Returns:
            The size of the dataset.
        """
        return len(self.data[0])

    def __getitem__(self, idx: int) -> Tuple[
        torch.Tensor,
        torch.Tensor,
        torch.Tensor,
        torch.Tensor,
        torch.Tensor,
        torch.Tensor,
    ]:
        """
        Returns a tuple of (X, Y, winner, odds_1, odds_2) for the given index.

        The data is randomly flipped to simulate the possibility of a fight being
        between two fighters in either order.

        Args:
            idx: The index of the data to return.

        Returns:
            A tuple of (X, Y, winner, odds_1, odds_2) where X and Y are the
            input data for the two fighters, winner is a tensor of size 1
            indicating which fighter won, and odds_1 and odds_2 are the opening
            odds for the two fighters.
        """
        X1, X2, X3, winner, odds_1, odds_2 = [x[idx] for x in self.data]

        if np.random.random() >= 0.5:
            X1, X2 = X2, X1
            winner = 1 - winner
            odds_1, odds_2 = odds_2, odds_1

        return X1, X2, X3, winner.reshape(-1), odds_1.reshape(-1), odds_2.reshape(-1)

    def get_fight_data_from_ids(self, fight_ids: Optional[List[str]] = None) -> Tuple[
        torch.FloatTensor,
        torch.FloatTensor,
        torch.FloatTensor,
        torch.FloatTensor,
        torch.FloatTensor,
        torch.FloatTensor,
        NDArray[np.str_],
        NDArray[np.str_],
    ]:
        """
        Returns a tuple of (X, Y, winner, odds_1, odds_2, fighter_names, opponent_names)
        for the given fight ids.

        If fight_ids is None, returns all the data in the dataset.

        Args:
            fight_ids: The list of fight ids to include in the dataset. If None,
                use all the data in the dataset.

        Returns:
            A tuple of (X, Y, winner, odds_1, odds_2, fighter_names, opponent_names)
            where X and Y are the input data for the two fighters, winner is a tensor
            of size 1 indicating which fighter won, and odds_1 and odds_2 are the
            opening odds for the two fighters. fighter_names and opponent_names are
            the names of the fighters and their opponents.
        """
        if fight_ids is not None:
            fight_data = self.fight_data[self.fight_data["fight_id"].isin(fight_ids)]
        else:
            fight_data = self.fight_data.copy()

        data = [
            torch.FloatTensor(
                np.asarray([fight_data[x + "_x"].values for x in self.X_set]).T
            ),
            torch.FloatTensor(
                np.asarray([fight_data[x + "_y"].values for x in self.X_set]).T
            ),
            torch.FloatTensor(
                np.asarray([fight_data[x + "_x"].values for x in self.Xf_set]).T
            ),
            torch.FloatTensor(
                (fight_data["winner_x"] != fight_data["fighter_id_x"]).values
            ),
            torch.FloatTensor(fight_data["opening_x"].values),
            torch.FloatTensor(fight_data["opening_y"].values),
        ]

        fighter_names = np.array(fight_data["fighter_name_x"].values)
        opponent_names = np.array(fight_data["fighter_name_y"].values)

        X1, X2, X3, Y, odds1, odds2 = data

        return X1, X2, X3, Y, odds1, odds2, fighter_names, opponent_names
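
Below is a minimal usage sketch (not part of the project source) showing how this dataset class (apparently BasicDataset, judging by the X_set = BasicDataset.X_set reference further down) could be wired into a standard PyTorch DataLoader. The data_processor and train_fight_ids objects are assumed to already exist.

# Hypothetical usage sketch. Assumes `data_processor` is a DataProcessor whose
# normalized data is already populated, and `train_fight_ids` is a list of
# fight ids present in that data.
from torch.utils.data import DataLoader

from ufcpredictor.datasets import BasicDataset

train_dataset = BasicDataset(
    data_processor=data_processor,
    fight_ids=train_fight_ids,
    # X_set=None and Xf_set=None keep the class defaults listed above.
)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

for X1, X2, X3, winner, odds_1, odds_2 in train_loader:
    pass  # forward pass and loss computation would go here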

__getitem__(idx)

Returns a tuple of (X, Y, winner, odds_1, odds_2) for the given index.

The data is randomly flipped to simulate the possibility of a fight being between two fighters in either order.

Parameters:

    idx (int, required): The index of the data to return.

Returns:

    Tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor]: A tuple of
    (X, Y, winner, odds_1, odds_2) where X and Y are the input data for the
    two fighters, winner is a tensor of size 1 indicating which fighter won,
    and odds_1 and odds_2 are the opening odds for the two fighters.

Source code in ufcpredictor/datasets.py
def __getitem__(self, idx: int) -> Tuple[
    torch.Tensor,
    torch.Tensor,
    torch.Tensor,
    torch.Tensor,
    torch.Tensor,
    torch.Tensor,
]:
    """
    Returns a tuple of (X, Y, winner, odds_1, odds_2) for the given index.

    The data is randomly flipped to simulate the possibility of a fight being
    between two fighters in either order.

    Args:
        idx: The index of the data to return.

    Returns:
        A tuple of (X, Y, winner, odds_1, odds_2) where X and Y are the
        input data for the two fighters, winner is a tensor of size 1
        indicating which fighter won, and odds_1 and odds_2 are the opening
        odds for the two fighters.
    """
    X1, X2, X3, winner, odds_1, odds_2 = [x[idx] for x in self.data]

    if np.random.random() >= 0.5:
        X1, X2 = X2, X1
        winner = 1 - winner
        odds_1, odds_2 = odds_2, odds_1

    return X1, X2, X3, winner.reshape(-1), odds_1.reshape(-1), odds_2.reshape(-1)
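
As a quick sketch (reusing the hypothetical train_dataset from the example above), indexing the dataset returns the six tensors in order; because of the random flip inside __getitem__, the same index can yield the two fighters in either order on different calls.

import numpy as np

np.random.seed(0)  # fix the flip so this sketch is reproducible

X1, X2, X3, winner, odds_1, odds_2 = train_dataset[0]
print(X1.shape, X2.shape, X3.shape)
print(winner)  # a 1-element tensor, 0.0 or 1.0 depending on the (possibly flipped) order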

__init__(data_processor, fight_ids, X_set=None, Xf_set=None)

Constructor for ForecastDataset.

Parameters:

    data_processor (DataProcessor, required): The DataProcessor instance that
    contains the data.

    fight_ids (List[str], required): The list of fight ids to include in the
    dataset.

    X_set (Optional[List[str]], default None): The list of columns to include
    in the dataset. If None, use all columns.

    Xf_set (Optional[List[str]], default None): The list of fight-feature
    columns to include in the dataset. If None, the class default is used.

Raises:

    ValueError: If some columns are not found in the normalized data.

Source code in ufcpredictor/datasets.py
def __init__(
    self,
    data_processor: DataProcessor,
    fight_ids: List[str],
    X_set: Optional[List[str]] = None,
    Xf_set: Optional[List[str]] = None,
) -> None:
    """
    Constructor for ForecastDataset.

    Args:
        data_processor: The DataProcessor instance that contains the data.
        fight_ids: The list of fight ids to include in the dataset.
        X_set: The list of columns to include in the dataset. If None, use all
            columns.

    Raises:
        ValueError: If some columns are not found in the normalized data.
    """
    self.data_processor = data_processor
    self.fight_ids = fight_ids

    if X_set is not None:
        self.X_set = X_set

    if Xf_set is not None:
        self.Xf_set = Xf_set

    not_found = []
    for column in self.X_set + self.Xf_set:
        if column not in self.data_processor.data_normalized.columns:
            not_found.append(column)

    if len(not_found) > 0:
        raise ValueError(f"Columns not found in normalized data: {not_found}")

    self.load_data()
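
A short sketch of the column check above: passing a restricted X_set works as long as every column exists in data_processor.data_normalized, while unknown columns raise a ValueError that lists them. The column names below are taken from the default X_set; data_processor and train_fight_ids are assumed to exist as in the earlier sketch.

small_X_set = [
    "win_per_fight",
    "strikes_succ_per_minute",
    "strikes_succ_opponent_per_minute",
    "time_since_last_fight",
]

small_dataset = BasicDataset(data_processor, train_fight_ids, X_set=small_X_set)

try:
    BasicDataset(data_processor, train_fight_ids, X_set=["not_a_column"])
except ValueError as err:
    print(err)  # Columns not found in normalized data: ['not_a_column']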

__len__()

Returns the size of the dataset.

Returns:

    int: The size of the dataset.

Source code in ufcpredictor/datasets.py
def __len__(self) -> int:
    """Returns the size of the dataset.

    Returns:
        The size of the dataset.
    """
    return len(self.data[0])

get_fight_data_from_ids(fight_ids=None)

Returns a tuple of (X, Y, winner, odds_1, odds_2, fighter_names, opponent_names) for the given fight ids.

If fight_ids is None, returns all the data in the dataset.

Parameters:

    fight_ids (Optional[List[str]], default None): The list of fight ids to
    include in the dataset. If None, use all the data in the dataset.

Returns:

    Tuple of FloatTensors and NDArrays: A tuple of (X, Y, winner, odds_1,
    odds_2, fighter_names, opponent_names) where X and Y are the input data
    for the two fighters, winner is a tensor of size 1 indicating which
    fighter won, and odds_1 and odds_2 are the opening odds for the two
    fighters. fighter_names and opponent_names are the names of the fighters
    and their opponents.

Source code in ufcpredictor/datasets.py
def get_fight_data_from_ids(self, fight_ids: Optional[List[str]] = None) -> Tuple[
    torch.FloatTensor,
    torch.FloatTensor,
    torch.FloatTensor,
    torch.FloatTensor,
    torch.FloatTensor,
    torch.FloatTensor,
    NDArray[np.str_],
    NDArray[np.str_],
]:
    """
    Returns a tuple of (X, Y, winner, odds_1, odds_2, fighter_names, opponent_names)
    for the given fight ids.

    If fight_ids is None, returns all the data in the dataset.

    Args:
        fight_ids: The list of fight ids to include in the dataset. If None,
            use all the data in the dataset.

    Returns:
        A tuple of (X, Y, winner, odds_1, odds_2, fighter_names, opponent_names)
        where X and Y are the input data for the two fighters, winner is a tensor
        of size 1 indicating which fighter won, and odds_1 and odds_2 are the
        opening odds for the two fighters. fighter_names and opponent_names are
        the names of the fighters and their opponents.
    """
    if fight_ids is not None:
        fight_data = self.fight_data[self.fight_data["fight_id"].isin(fight_ids)]
    else:
        fight_data = self.fight_data.copy()

    data = [
        torch.FloatTensor(
            np.asarray([fight_data[x + "_x"].values for x in self.X_set]).T
        ),
        torch.FloatTensor(
            np.asarray([fight_data[x + "_y"].values for x in self.X_set]).T
        ),
        torch.FloatTensor(
            np.asarray([fight_data[x + "_x"].values for x in self.Xf_set]).T
        ),
        torch.FloatTensor(
            (fight_data["winner_x"] != fight_data["fighter_id_x"]).values
        ),
        torch.FloatTensor(fight_data["opening_x"].values),
        torch.FloatTensor(fight_data["opening_y"].values),
    ]

    fighter_names = np.array(fight_data["fighter_name_x"].values)
    opponent_names = np.array(fight_data["fighter_name_y"].values)

    X1, X2, X3, Y, odds1, odds2 = data

    return X1, X2, X3, Y, odds1, odds2, fighter_names, opponent_names
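
A hedged sketch of how this accessor might be used to pull evaluation tensors together with fighter names. Calling it with no argument returns every fight in the dataset; test_fight_ids is assumed to be a held-out list of fight ids and data_processor the same processed instance as in the earlier sketches.

test_dataset = BasicDataset(data_processor, test_fight_ids)

(
    X1, X2, X3, winner, odds_1, odds_2,
    fighter_names, opponent_names,
) = test_dataset.get_fight_data_from_ids()  # None selects all fights in the dataset

print(len(fighter_names), "fights selected")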

load_data()

Loads the data into a format that can be used to train a model.

The data is first reduced to only include the columns specified in X_set. Then, the stats are shifted to get the stats prior to each fight. The data is then merged with itself to get one row per match with the data from the two fighters. The matchings of the fighter with itself are removed and only one row per match is kept. Finally, the data is loaded into torch tensors.

Source code in ufcpredictor/datasets.py
def load_data(self) -> None:
    """
    Loads the data into a format that can be used to train a model.

    The data is first reduced to only include the columns specified in X_set.
    Then, the stats are shifted to get the stats prior to each fight.
    The data is then merged with itself to get one row per match with the data
    from the two fighters.
    The matchings of the fighter with itself are removed and only one row per
    match is kept.
    Finally, the data is loaded into torch tensors.
    """
    reduced_data = self.data_processor.data_normalized.copy()

    # We shift stats because the input for the model should be the
    # stats prior to the fight
    for x in self.X_set:
        if x not in ["age", "num_fight", "time_since_last_fight"]:
            reduced_data[x] = reduced_data.groupby("fighter_id")[x].shift(1)

    # We remove invalid fights
    reduced_data = reduced_data[reduced_data["fight_id"].isin(self.fight_ids)]

    # We now merge stats with itself to get one row per match with the data
    # from the two fighters
    fight_data = reduced_data.merge(
        reduced_data,
        left_on="fight_id",
        right_on="fight_id",
        how="inner",
        suffixes=("_x", "_y"),
    )

    # Remove matchings of the fighter with itself and also only keep
    # one row per match (fighter1 vs fighter2 is the same as fighter 2 vs fighter 1)
    fight_data = fight_data[
        fight_data["fighter_id_x"] != fight_data["fighter_id_y"]
    ]
    fight_data = fight_data.drop_duplicates(subset=["fight_id"], keep="first")

    # Now we load the data into torch tensors
    # This is a list of FloatTensors each having a size equal to the number
    # of fights.
    self.data: List[torch.Tensor] = [
        torch.FloatTensor(
            np.asarray([fight_data[x + "_x"].values for x in self.X_set]).T
        ),
        torch.FloatTensor(
            np.asarray([fight_data[x + "_y"].values for x in self.X_set]).T
        ),
        torch.FloatTensor(
            np.asarray([fight_data[xf + "_x"].values for xf in self.Xf_set]).T
        ),
        torch.FloatTensor(
            (fight_data["winner_x"] != fight_data["fighter_id_x"]).values
        ),
        torch.FloatTensor(fight_data["opening_x"].values),
        torch.FloatTensor(fight_data["opening_y"].values),
    ]

    if len(self.Xf_set) == 0:
        self.data[2] = torch.empty(len(fight_data["winner_x"]), 0)

    self.fight_data = fight_data
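
The central step in load_data is the per-fighter shift: each row must carry a fighter's statistics from before the fight, not including it. The toy pandas snippet below (synthetic data, not the project's schema) illustrates that groupby/shift idiom in isolation.

import pandas as pd

# Two fighters with two fights each and one accumulated statistic.
df = pd.DataFrame(
    {
        "fighter_id": ["a", "a", "b", "b"],
        "fight_id": [1, 2, 3, 4],
        "strikes_succ_per_minute": [3.0, 4.0, 5.0, 6.0],
    }
)

# Shift within each fighter so a row holds the value prior to that fight.
df["strikes_succ_per_minute"] = df.groupby("fighter_id")[
    "strikes_succ_per_minute"
].shift(1)

print(df)
# The first fight of each fighter becomes NaN: there is no earlier history.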

ForecastDataset

Bases: Dataset

A dataset class designed to handle forecasting data for UFC fight predictions.

This class provides a structured way to store and retrieve data for training and testing neural network models. It is designed to work with the DataProcessor class to prepare and normalize the data.

Source code in ufcpredictor/datasets.py
class ForecastDataset(Dataset):
    """
    A dataset class designed to handle forecasting data for UFC fight predictions.

    This class provides a structured way to store and retrieve data for training and
    testing neural network models. It is designed to work with the DataProcessor class
    to prepare and normalize the data.
    """

    X_set = BasicDataset.X_set
    Xf_set = BasicDataset.Xf_set

    def __init__(
        self,
        data_processor: DataProcessor,
        X_set: Optional[List[str]] = None,
        Xf_set: Optional[List[str]] = None,
    ) -> None:
        """
        Constructor for ForecastDataset.

        Args:
            data_processor: The DataProcessor instance that contains the data.
            X_set: The list of columns to include in the dataset. If None, use all
                columns.

        Raises:
            ValueError: If some columns are not found in the normalized data.
        """
        self.data_processor = data_processor

        if X_set is not None:
            self.X_set = X_set

        if Xf_set is not None:
            self.Xf_set = Xf_set

        not_found = []
        for column in self.X_set + self.Xf_set:
            if column not in self.data_processor.data_normalized.columns:
                not_found.append(column)

        if len(not_found) > 0:
            raise ValueError(f"Columns not found in normalized data: {not_found}")

    def get_single_forecast_prediction(
        self,
        fighter_name: str,
        opponent_name: str,
        event_date: str | datetime.date,
        odds1: int,
        odds2: int,
        model: nn.Module,
        fight_features: List[float] = [],
        parse_ids: bool = False,
    ) -> Tuple[float, float]:
        """
        Make a prediction for a single match. Either providing the names of the
        fighters and their opponents, or providing the ids of the fighters and
        their opponents.

        Args:
            fighter_name: The name of the fighter.
            opponent_name: The name of the opponent.
            event_date: The date of the fight.
            odds1: The odds of the first fighter.
            odds2: The odds of the second fighter.
            model: The model to make the prediction with.
            parse_ids: Whether to parse the ids of the fighters and opponents. Ids
                are parsed in fields "fighter_name" and "opponent_name" if True,
                and names are parsed if False.

        Returns: The predicted odds for the first and second fighters.
        """
        p1, p2 = self.get_forecast_prediction(
            [
                fighter_name,
            ],
            [
                opponent_name,
            ],
            [
                event_date,
            ],
            [
                odds1,
            ],
            [
                odds2,
            ],
            model=model,
            fight_features=[
                fight_features,
            ],
            parse_ids=parse_ids,
        )

        return p1[0][0], p2[0][0]

    def get_forecast_prediction(
        self,
        fighter_names: List[str],
        opponent_names: List[str],
        event_dates: List[str | datetime.date],
        fighter_odds: List[float],
        opponent_odds: List[float],
        model: nn.Module,
        fight_features: List[List[float]] = [],
        parse_ids: bool = False,
        device: str = "cpu",
    ) -> Tuple[NDArray, NDArray]:
        """
        Make a prediction for a given list of matches. Either providing the names of
        the fighters and their opponents, or providing the ids of the fighters and
        their opponents.

        Args:
            fighter_names: The list of fighter names.
            opponent_names: The list of opponent names.
            event_dates: The list of event dates.
            fighter_odds: The list of fighter odds.
            opponent_odds: The list of opponent odds.
            model: The model to make the prediction with.
            parse_ids: Whether to parse the ids of the fighters and opponents. Ids
                are parsed in fields "fighter_names" and "opponent_names" if True,
                and names are parsed if False.
            device: The device to use for the prediction.

        Returns:
            A tuple of two numpy arrays, each containing the predictions for one of the
            fighters.
        """
        if not parse_ids:
            fighter_ids = [self.data_processor.get_fighter_id(x) for x in fighter_names]
            opponent_ids = [
                self.data_processor.get_fighter_id(x) for x in opponent_names
            ]
        else:
            fighter_ids = fighter_names
            opponent_ids = opponent_names

        match_data = pd.DataFrame(
            {
                "fighter_id": fighter_ids + opponent_ids,
                "event_date_forecast": event_dates * 2,
                "opening": np.concatenate((fighter_odds, opponent_odds)),
            }
        )

        for feature_name, stats in zip(self.Xf_set, np.asarray(fight_features).T):
            match_data[feature_name] = np.concatenate((stats, stats))

        match_data = match_data.merge(
            self.data_processor.data_normalized,
            left_on="fighter_id",
            right_on="fighter_id",
        )

        match_data = match_data[
            match_data["event_date"] < match_data["event_date_forecast"]
        ]
        match_data = match_data.sort_values(
            by=["fighter_id", "event_date"],
            ascending=[True, False],
        )
        match_data = match_data.drop_duplicates(
            subset=["fighter_id", "event_date_forecast"],
            keep="first",
        )
        match_data["id_"] = (
            match_data["fighter_id"].astype(str)
            + "_"
            + match_data["event_date_forecast"].astype(str)
        )

        match_data = match_data.rename(
            columns={
                "weight_x": "weight",
            }
        )

        ###############################################################
        # Now we need to fix some fields to adapt them to the match to
        # be predicted, since we are modifying the last line we are
        # modifying on top of the last fight.
        ###############################################################
        # Add time_since_last_fight information
        match_data["event_date_forecast"] = pd.to_datetime(
            match_data["event_date_forecast"]
        )
        match_data["time_since_last_fight"] = (
            match_data["event_date_forecast"] - match_data["event_date"]
        ).dt.days

        match_data["age"] = (
            match_data["event_date_forecast"] - match_data["fighter_dob"]
        ).dt.days / 365
        match_data["num_fight"] = match_data["num_fight"] + 1

        new_fields = ["age", "time_since_last_fight"] + self.Xf_set
        # Now we iterate over enhancers, in case it is a RankedField
        # We need to pass the appropriate fields to rank them.
        fields = []
        exponents = []
        for data_enhancer in self.data_processor.data_enhancers:
            if isinstance(data_enhancer, RankedFields):
                for field, exponent in zip(
                    data_enhancer.fields, data_enhancer.exponents
                ):
                    if field in new_fields:
                        exponents.append(exponent)
                        fields.append(field)

        # If there are fields to be ranked, we do so
        if len(fields) > 0:
            ranked_fields = RankedFields(fields, exponents)

            original_df = self.data_processor.data[
                [field + "not_ranked" for field in fields]
            ].rename(columns={field + "not_ranked": field for field in fields})

            match_data[fields] = ranked_fields.add_data_fields(
                pd.concat([original_df, match_data[fields]])
            ).iloc[len(self.data_processor.data) :][fields]

        # Now we will normalize the fields that need to be normalized.
        for field in new_fields:
            if field in self.data_processor.normalization_factors.keys():
                match_data[field] /= self.data_processor.normalization_factors[field]
        ###############################################################
        # Now we start building the tensor to input to the model
        ###############################################################
        # This data dict is used to facilitate the construction of the tensors
        data_dict = {
            id_: data
            for id_, data in zip(
                match_data["id_"].values,
                np.asarray([match_data[x] for x in self.X_set]).T,
            )
        }

        for feature_name, stats in zip(self.Xf_set, np.asarray(fight_features).T):
            match_data[feature_name] = np.concatenate((stats, stats))

        if len(self.Xf_set) > 0:
            fight_data_dict = {
                id_: data
                for id_, data in zip(
                    match_data["id_"].values,
                    np.asarray([match_data[x] for x in self.Xf_set]).T,
                )
            }
        else:
            fight_data_dict = {id_: [] for id_ in match_data["id_"].values}

        data = [
            torch.FloatTensor(
                np.asarray(
                    [
                        data_dict[fighter_id + "_" + str(event_date)]
                        for fighter_id, event_date in zip(fighter_ids, event_dates)
                    ]
                )
            ),  # X1
            torch.FloatTensor(
                np.asarray(
                    [
                        data_dict[fighter_id + "_" + str(event_date)]
                        for fighter_id, event_date in zip(opponent_ids, event_dates)
                    ]
                )
            ),  # X2
            torch.FloatTensor(
                np.asarray(
                    [
                        fight_data_dict[fighter_id + "_" + str(event_date)]
                        for fighter_id, event_date in zip(fighter_ids, event_dates)
                    ]
                )
            ),  # X3
            torch.FloatTensor(np.asarray(fighter_odds)).reshape(-1, 1),  # Odds1,
            torch.FloatTensor(np.asarray(opponent_odds)).reshape(-1, 1),  # Odds2
        ]

        X1, X2, X3, odds1, odds2 = data
        X1, X2, X3, odds1, odds2, model = (
            X1.to(device),
            X2.to(device),
            X3.to(device),
            odds1.to(device),
            odds2.to(device),
            model.to(device),
        )
        model.eval()
        with torch.no_grad():
            predictions_1 = model(X1, X2, X3, odds1, odds2).detach().cpu().numpy()
            predictions_2 = 1 - model(X2, X1, X3, odds2, odds1).detach().cpu().numpy()

        return predictions_1, predictions_2
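
Below is a hedged end-to-end sketch of a single-fight forecast with this class. data_processor and a trained model are assumed to exist, the fighter names are placeholders, and the odds values must follow the same convention as the "opening" column of the processed data.

from ufcpredictor.datasets import ForecastDataset

forecast_dataset = ForecastDataset(data_processor=data_processor)

p1, p2 = forecast_dataset.get_single_forecast_prediction(
    fighter_name="Fighter A",  # placeholder names
    opponent_name="Fighter B",
    event_date="2024-06-01",
    odds1=150,  # placeholder odds values
    odds2=-170,
    model=model,
)

# p1 and p2 are the two order-symmetrised prediction values returned by
# get_forecast_prediction for this single bout.
print(p1, p2)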

__init__(data_processor, X_set=None, Xf_set=None)

Constructor for ForecastDataset.

Parameters:

    data_processor (DataProcessor, required): The DataProcessor instance that
    contains the data.

    X_set (Optional[List[str]], default None): The list of columns to include
    in the dataset. If None, use all columns.

    Xf_set (Optional[List[str]], default None): The list of fight-feature
    columns to include in the dataset. If None, the class default is used.

Raises:

    ValueError: If some columns are not found in the normalized data.

Source code in ufcpredictor/datasets.py
def __init__(
    self,
    data_processor: DataProcessor,
    X_set: Optional[List[str]] = None,
    Xf_set: Optional[List[str]] = None,
) -> None:
    """
    Constructor for ForecastDataset.

    Args:
        data_processor: The DataProcessor instance that contains the data.
        X_set: The list of columns to include in the dataset. If None, use all
            columns.

    Raises:
        ValueError: If some columns are not found in the normalized data.
    """
    self.data_processor = data_processor

    if X_set is not None:
        self.X_set = X_set

    if Xf_set is not None:
        self.Xf_set = Xf_set

    not_found = []
    for column in self.X_set + self.Xf_set:
        if column not in self.data_processor.data_normalized.columns:
            not_found.append(column)

    if len(not_found) > 0:
        raise ValueError(f"Columns not found in normalized data: {not_found}")

get_forecast_prediction(fighter_names, opponent_names, event_dates, fighter_odds, opponent_odds, model, fight_features=[], parse_ids=False, device='cpu')

Make a prediction for a given list of matches, either by providing the names of the fighters and their opponents or by providing their ids.

Parameters:

    fighter_names (List[str], required): The list of fighter names.

    opponent_names (List[str], required): The list of opponent names.

    event_dates (List[str | date], required): The list of event dates.

    fighter_odds (List[float], required): The list of fighter odds.

    opponent_odds (List[float], required): The list of opponent odds.

    model (Module, required): The model to make the prediction with.

    fight_features (List[List[float]], default []): The fight-feature values
    for each match, ordered as in Xf_set.

    parse_ids (bool, default False): Whether to parse the ids of the fighters
    and opponents. Ids are expected in the "fighter_names" and
    "opponent_names" fields if True, and names if False.

    device (str, default 'cpu'): The device to use for the prediction.

Returns:

    Tuple[NDArray, NDArray]: A tuple of two numpy arrays, each containing the
    predictions for one of the fighters.

Source code in ufcpredictor/datasets.py
def get_forecast_prediction(
    self,
    fighter_names: List[str],
    opponent_names: List[str],
    event_dates: List[str | datetime.date],
    fighter_odds: List[float],
    opponent_odds: List[float],
    model: nn.Module,
    fight_features: List[List[float]] = [],
    parse_ids: bool = False,
    device: str = "cpu",
) -> Tuple[NDArray, NDArray]:
    """
    Make a prediction for a given list of matches. Either providing the names of
    the fighters and their opponents, or providing the ids of the fighters and
    their opponents.

    Args:
        fighter_names: The list of fighter names.
        opponent_names: The list of opponent names.
        event_dates: The list of event dates.
        fighter_odds: The list of fighter odds.
        opponent_odds: The list of opponent odds.
        model: The model to make the prediction with.
        parse_ids: Whether to parse the ids of the fighters and opponents. Ids
            are parsed in fields "fighter_names" and "opponent_names" if True,
            and names are parsed if False.
        device: The device to use for the prediction.

    Returns:
        A tuple of two numpy arrays, each containing the predictions for one of the
        fighters.
    """
    if not parse_ids:
        fighter_ids = [self.data_processor.get_fighter_id(x) for x in fighter_names]
        opponent_ids = [
            self.data_processor.get_fighter_id(x) for x in opponent_names
        ]
    else:
        fighter_ids = fighter_names
        opponent_ids = opponent_names

    match_data = pd.DataFrame(
        {
            "fighter_id": fighter_ids + opponent_ids,
            "event_date_forecast": event_dates * 2,
            "opening": np.concatenate((fighter_odds, opponent_odds)),
        }
    )

    for feature_name, stats in zip(self.Xf_set, np.asarray(fight_features).T):
        match_data[feature_name] = np.concatenate((stats, stats))

    match_data = match_data.merge(
        self.data_processor.data_normalized,
        left_on="fighter_id",
        right_on="fighter_id",
    )

    match_data = match_data[
        match_data["event_date"] < match_data["event_date_forecast"]
    ]
    match_data = match_data.sort_values(
        by=["fighter_id", "event_date"],
        ascending=[True, False],
    )
    match_data = match_data.drop_duplicates(
        subset=["fighter_id", "event_date_forecast"],
        keep="first",
    )
    match_data["id_"] = (
        match_data["fighter_id"].astype(str)
        + "_"
        + match_data["event_date_forecast"].astype(str)
    )

    match_data = match_data.rename(
        columns={
            "weight_x": "weight",
        }
    )

    ###############################################################
    # Now we need to fix some fields to adapt them to the match to
    # be predicted, since we are modifying the last line we are
    # modifying on top of the last fight.
    ###############################################################
    # Add time_since_last_fight information
    match_data["event_date_forecast"] = pd.to_datetime(
        match_data["event_date_forecast"]
    )
    match_data["time_since_last_fight"] = (
        match_data["event_date_forecast"] - match_data["event_date"]
    ).dt.days

    match_data["age"] = (
        match_data["event_date_forecast"] - match_data["fighter_dob"]
    ).dt.days / 365
    match_data["num_fight"] = match_data["num_fight"] + 1

    new_fields = ["age", "time_since_last_fight"] + self.Xf_set
    # Now we iterate over enhancers, in case it is a RankedField
    # We need to pass the appropriate fields to rank them.
    fields = []
    exponents = []
    for data_enhancer in self.data_processor.data_enhancers:
        if isinstance(data_enhancer, RankedFields):
            for field, exponent in zip(
                data_enhancer.fields, data_enhancer.exponents
            ):
                if field in new_fields:
                    exponents.append(exponent)
                    fields.append(field)

    # If there are fields to be ranked, we do so
    if len(fields) > 0:
        ranked_fields = RankedFields(fields, exponents)

        original_df = self.data_processor.data[
            [field + "not_ranked" for field in fields]
        ].rename(columns={field + "not_ranked": field for field in fields})

        match_data[fields] = ranked_fields.add_data_fields(
            pd.concat([original_df, match_data[fields]])
        ).iloc[len(self.data_processor.data) :][fields]

    # Now we will normalize the fields that need to be normalized.
    for field in new_fields:
        if field in self.data_processor.normalization_factors.keys():
            match_data[field] /= self.data_processor.normalization_factors[field]
    ###############################################################
    # Now we start building the tensor to input to the model
    ###############################################################
    # This data dict is used to facilitate the construction of the tensors
    data_dict = {
        id_: data
        for id_, data in zip(
            match_data["id_"].values,
            np.asarray([match_data[x] for x in self.X_set]).T,
        )
    }

    for feature_name, stats in zip(self.Xf_set, np.asarray(fight_features).T):
        match_data[feature_name] = np.concatenate((stats, stats))

    if len(self.Xf_set) > 0:
        fight_data_dict = {
            id_: data
            for id_, data in zip(
                match_data["id_"].values,
                np.asarray([match_data[x] for x in self.Xf_set]).T,
            )
        }
    else:
        fight_data_dict = {id_: [] for id_ in match_data["id_"].values}

    data = [
        torch.FloatTensor(
            np.asarray(
                [
                    data_dict[fighter_id + "_" + str(event_date)]
                    for fighter_id, event_date in zip(fighter_ids, event_dates)
                ]
            )
        ),  # X1
        torch.FloatTensor(
            np.asarray(
                [
                    data_dict[fighter_id + "_" + str(event_date)]
                    for fighter_id, event_date in zip(opponent_ids, event_dates)
                ]
            )
        ),  # X2
        torch.FloatTensor(
            np.asarray(
                [
                    fight_data_dict[fighter_id + "_" + str(event_date)]
                    for fighter_id, event_date in zip(fighter_ids, event_dates)
                ]
            )
        ),  # X3
        torch.FloatTensor(np.asarray(fighter_odds)).reshape(-1, 1),  # Odds1,
        torch.FloatTensor(np.asarray(opponent_odds)).reshape(-1, 1),  # Odds2
    ]

    X1, X2, X3, odds1, odds2 = data
    X1, X2, X3, odds1, odds2, model = (
        X1.to(device),
        X2.to(device),
        X3.to(device),
        odds1.to(device),
        odds2.to(device),
        model.to(device),
    )
    model.eval()
    with torch.no_grad():
        predictions_1 = model(X1, X2, X3, odds1, odds2).detach().cpu().numpy()
        predictions_2 = 1 - model(X2, X1, X3, odds2, odds1).detach().cpu().numpy()

    return predictions_1, predictions_2
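
For a full card, the batch form can be called directly. The sketch below reuses the hypothetical forecast_dataset and model from the previous example, with placeholder names, dates and odds.

p1, p2 = forecast_dataset.get_forecast_prediction(
    fighter_names=["Fighter A", "Fighter C"],
    opponent_names=["Fighter B", "Fighter D"],
    event_dates=["2024-06-01", "2024-06-01"],
    fighter_odds=[150, -200],
    opponent_odds=[-170, 170],
    model=model,
    device="cpu",
)

# p1 and p2 are numpy arrays with one row per matchup.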

get_single_forecast_prediction(fighter_name, opponent_name, event_date, odds1, odds2, model, fight_features=[], parse_ids=False)

Make a prediction for a single match, either by providing the names of the fighter and the opponent or by providing their ids.

Parameters:

    fighter_name (str, required): The name of the fighter.

    opponent_name (str, required): The name of the opponent.

    event_date (str | date, required): The date of the fight.

    odds1 (int, required): The odds of the first fighter.

    odds2 (int, required): The odds of the second fighter.

    model (Module, required): The model to make the prediction with.

    fight_features (List[float], default []): The fight-feature values for
    this match, ordered as in Xf_set.

    parse_ids (bool, default False): Whether to parse the ids of the fighters
    and opponents. Ids are expected in the "fighter_name" and "opponent_name"
    fields if True, and names if False.

Returns:

    Tuple[float, float]: The predicted odds for the first and second fighters.

Source code in ufcpredictor/datasets.py
def get_single_forecast_prediction(
    self,
    fighter_name: str,
    opponent_name: str,
    event_date: str | datetime.date,
    odds1: int,
    odds2: int,
    model: nn.Module,
    fight_features: List[float] = [],
    parse_ids: bool = False,
) -> Tuple[float, float]:
    """
    Make a prediction for a single match. Either providing the names of the
    fighters and their opponents, or providing the ids of the fighters and
    their opponents.

    Args:
        fighter_name: The name of the fighter.
        opponent_name: The name of the opponent.
        event_date: The date of the fight.
        odds1: The odds of the first fighter.
        odds2: The odds of the second fighter.
        model: The model to make the prediction with.
        parse_ids: Whether to parse the ids of the fighters and opponents. Ids
            are parsed in fields "fighter_name" and "opponent_name" if True,
            and names are parsed if False.

    Returns: The predicted odds for the first and second fighters.
    """
    p1, p2 = self.get_forecast_prediction(
        [
            fighter_name,
        ],
        [
            opponent_name,
        ],
        [
            event_date,
        ],
        [
            odds1,
        ],
        [
            odds2,
        ],
        model=model,
        fight_features=[
            fight_features,
        ],
        parse_ids=parse_ids,
    )

    return p1[0][0], p2[0][0]

Loss Functions

This module contains loss functions designed to train neural network models to predict the outcome of UFC fights.

The loss functions take into account the predictions made by the model and the actual outcomes of the fights, and are used to optimize the model's performance.

BettingLoss

Bases: Module

Source code in ufcpredictor/loss_functions.py
class BettingLoss(nn.Module):

    mlflow_params: List[str] = [
        "max_bet",
    ]

    def __init__(self, max_bet: float = 10) -> None:
        """
        Initializes the BettingLoss instance.

        This function calls the constructor of the parent class and performs no
        other actions.
        """
        super(BettingLoss, self).__init__()
        self.max_bet = max_bet

    def get_bet(self, prediction: torch.Tensor | float) -> torch.Tensor | float:
        """
        Computes the bet for the given prediction.

        This function takes a prediction between 0 and 1 and returns the
        corresponding bet between 0 and 20. The bet is computed as the
        prediction times 2 times 10. This is just an approximation
        that seems to work well.

        Args:
            prediction: A tensor or float between 0 and 1 representing a
                prediction.

        Returns:
            A tensor or float between 0 and 20 representing the bet.
        """
        return prediction * 2 * self.max_bet

    def forward(
        self,
        predictions: torch.Tensor,
        targets: torch.Tensor,
        odds_1: torch.Tensor,
        odds_2: torch.Tensor,
    ) -> torch.Tensor:
        """
        Computes the betting loss for the given predictions and targets.

        This function takes a tensor of predictions between 0 and 1, a tensor of
        targets (0 or 1), and two tensors of odds. It returns a tensor with the
        computed betting loss, which is the mean of the losses minus the earnings,
        this is the net profit.

        The betting loss returned is the negative profit.

        Args:
            predictions: A tensor of predictions between 0 and 1.
            targets: A tensor of targets (0 or 1).
            odds_1: A tensor of odds for fighter 1.
            odds_2: A tensor of odds for fighter 2.

        Returns:
            A tensor with the computed betting loss.
        """
        msk = torch.round(predictions) == targets

        return_fighter_1 = self.get_bet(0.5 - predictions) * odds_1
        return_fighter_2 = self.get_bet(predictions - 0.5) * odds_2

        losses = torch.where(
            torch.round(predictions) == 0,
            self.get_bet(0.5 - predictions),
            self.get_bet(predictions - 0.5),
        )

        earnings = torch.zeros_like(losses)
        earnings[msk & (targets == 0)] = return_fighter_1[msk & (targets == 0)]
        earnings[msk & (targets == 1)] = return_fighter_2[msk & (targets == 1)]

        return (losses - earnings).mean()
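
The toy call below (a sketch, not project code) evaluates the loss on a hand-made batch; in training, predictions would come from the model and the call would be followed by loss.backward(). The odds values are placeholders.

import torch

from ufcpredictor.loss_functions import BettingLoss

criterion = BettingLoss(max_bet=10)

predictions = torch.tensor([0.2, 0.8, 0.6])  # model outputs in [0, 1]
targets = torch.tensor([0.0, 1.0, 0.0])      # 1.0 means the first fighter lost
odds_1 = torch.tensor([1.8, 2.4, 1.5])       # placeholder odds
odds_2 = torch.tensor([2.0, 1.6, 2.6])

loss = criterion(predictions, targets, odds_1, odds_2)
print(loss)  # a negative value corresponds to a net profit on this toy batch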

__init__(max_bet=10)

Initializes the BettingLoss instance.

This function calls the constructor of the parent class and stores the maximum bet size.

Source code in ufcpredictor/loss_functions.py
def __init__(self, max_bet: float = 10) -> None:
    """
    Initializes the BettingLoss instance.

    This function calls the constructor of the parent class and performs no
    other actions.
    """
    super(BettingLoss, self).__init__()
    self.max_bet = max_bet

forward(predictions, targets, odds_1, odds_2)

Computes the betting loss for the given predictions and targets.

This function takes a tensor of predictions between 0 and 1, a tensor of targets (0 or 1), and two tensors of odds. It returns a tensor with the computed betting loss: the mean of the amounts staked minus the amounts returned, which is the negative of the net profit.

Parameters:

    predictions (Tensor, required): A tensor of predictions between 0 and 1.

    targets (Tensor, required): A tensor of targets (0 or 1).

    odds_1 (Tensor, required): A tensor of odds for fighter 1.

    odds_2 (Tensor, required): A tensor of odds for fighter 2.

Returns:

    Tensor: A tensor with the computed betting loss.

Source code in ufcpredictor/loss_functions.py
def forward(
    self,
    predictions: torch.Tensor,
    targets: torch.Tensor,
    odds_1: torch.Tensor,
    odds_2: torch.Tensor,
) -> torch.Tensor:
    """
    Computes the betting loss for the given predictions and targets.

    This function takes a tensor of predictions between 0 and 1, a tensor of
    targets (0 or 1), and two tensors of odds. It returns a tensor with the
    computed betting loss, which is the mean of the losses minus the earnings,
    this is the net profit.

    The betting loss returned is the negative profit.

    Args:
        predictions: A tensor of predictions between 0 and 1.
        targets: A tensor of targets (0 or 1).
        odds_1: A tensor of odds for fighter 1.
        odds_2: A tensor of odds for fighter 2.

    Returns:
        A tensor with the computed betting loss.
    """
    msk = torch.round(predictions) == targets

    return_fighter_1 = self.get_bet(0.5 - predictions) * odds_1
    return_fighter_2 = self.get_bet(predictions - 0.5) * odds_2

    losses = torch.where(
        torch.round(predictions) == 0,
        self.get_bet(0.5 - predictions),
        self.get_bet(predictions - 0.5),
    )

    earnings = torch.zeros_like(losses)
    earnings[msk & (targets == 0)] = return_fighter_1[msk & (targets == 0)]
    earnings[msk & (targets == 1)] = return_fighter_2[msk & (targets == 1)]

    return (losses - earnings).mean()

get_bet(prediction)

Computes the bet for the given prediction.

This function takes a prediction between 0 and 1 and returns the corresponding bet between 0 and 2 * max_bet (0 to 20 with the default max_bet of 10). The bet is computed as the prediction times 2 times max_bet. This is just an approximation that seems to work well.

Parameters:

    prediction (Tensor | float, required): A tensor or float between 0 and 1
    representing a prediction.

Returns:

    Tensor | float: A tensor or float between 0 and 20 representing the bet.

Source code in ufcpredictor/loss_functions.py
def get_bet(self, prediction: torch.Tensor | float) -> torch.Tensor | float:
    """
    Computes the bet for the given prediction.

    This function takes a prediction between 0 and 1 and returns the
    corresponding bet between 0 and 20. The bet is computed as the
    prediction times 2 times 10. This is just an approximation
    that seems to work well.

    Args:
        prediction: A tensor or float between 0 and 1 representing a
            prediction.

    Returns:
        A tensor or float between 0 and 20 representing the bet.
    """
    return prediction * 2 * self.max_bet
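
A small worked example of the stake mapping, assuming the default max_bet of 10. Note that inside forward() the value passed to get_bet is the distance of the prediction from 0.5, so an even-money prediction stakes nothing and a maximally confident one stakes max_bet.

from ufcpredictor.loss_functions import BettingLoss

loss_fn = BettingLoss(max_bet=10)

print(loss_fn.get_bet(0.0))   # 0.0, i.e. a prediction of exactly 0.5 in forward()
print(loss_fn.get_bet(0.25))  # 5.0
print(loss_fn.get_bet(0.5))   # 10.0, a maximally confident prediction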

Models

This module contains neural network models designed to predict the outcome of UFC fights.

The models take into account various characteristics of the fighters and the odds of the fights, and can be used to make predictions on the outcome of a fight and to calculate the benefit of a bet.

FighterNet

Bases: Module

A neural network model designed to predict the outcome of a fight based on a single fighter's characteristics.

The model takes into account the characteristics of the fighter and the odds of the fight. It can be used to make predictions on the outcome of a fight and to calculate the benefit of a bet.

Source code in ufcpredictor/models.py
class FighterNet(nn.Module):
    """
    A neural network model designed to predict the outcome of a fight based on a single
    fighter's characteristics.

    The model takes into account the characteristics of the fighter and the odds of the
    fight. It can be used to make predictions on the outcome of a fight and to
    calculate the benefit of a bet.
    """

    mlflow_params: List[str] = ["dropout_prob", "network_shape"]

    def __init__(
        self,
        input_size: int,
        dropout_prob: float = 0.0,
        network_shape: List[int] = [128, 256, 512, 256, 127],
    ) -> None:
        """
        Initialize the FighterNet model with the given input size and dropout
        probability.

        Args:
            input_size: The size of the input to the model.
            dropout_prob: The probability of dropout.
            network_shape: Shape of the network layers (except input layer).
        """
        super(FighterNet, self).__init__()
        self.network_shape = [input_size] + network_shape
        self.fcs = nn.ModuleList(
            [
                nn.Linear(input_, output)
                for input_, output in zip(
                    self.network_shape[:-1], self.network_shape[1:]
                )
            ]
        )
        self.dropouts = nn.ModuleList(
            [nn.Dropout(p=dropout_prob) for _ in range(len(self.network_shape) - 1)]
        )
        self.dropout_prob = dropout_prob

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Compute the output of the model given the input tensor x.

        Args:
            x: The input tensor to the model.

        Returns:
            The output of the model.
        """
        for fc, dropout in zip(self.fcs, self.dropouts):
            x = F.relu(fc(x))
            x = dropout(x)

        return x
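
A minimal sketch of the model in isolation. The input size below is a placeholder for the number of fighter features (for instance len(X_set)); the output width follows the default network_shape, whose last layer has 127 units.

import torch

from ufcpredictor.models import FighterNet

input_size = 53  # placeholder for the number of fighter features
net = FighterNet(input_size=input_size, dropout_prob=0.2)

x = torch.randn(8, input_size)  # a batch of 8 fighter feature vectors
embedding = net(x)
print(embedding.shape)  # torch.Size([8, 127]) with the default network_shape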

__init__(input_size, dropout_prob=0.0, network_shape=[128, 256, 512, 256, 127])

Initialize the FighterNet model with the given input size and dropout probability.

Parameters:

    input_size (int, required): The size of the input to the model.

    dropout_prob (float, default 0.0): The probability of dropout.

    network_shape (List[int], default [128, 256, 512, 256, 127]): Shape of the
    network layers (except input layer).
Source code in ufcpredictor/models.py
def __init__(
    self,
    input_size: int,
    dropout_prob: float = 0.0,
    network_shape: List[int] = [128, 256, 512, 256, 127],
) -> None:
    """
    Initialize the FighterNet model with the given input size and dropout
    probability.

    Args:
        input_size: The size of the input to the model.
        dropout_prob: The probability of dropout.
        network_shape: Shape of the network layers (except input layer).
    """
    super(FighterNet, self).__init__()
    self.network_shape = [input_size] + network_shape
    self.fcs = nn.ModuleList(
        [
            nn.Linear(input_, output)
            for input_, output in zip(
                self.network_shape[:-1], self.network_shape[1:]
            )
        ]
    )
    self.dropouts = nn.ModuleList(
        [nn.Dropout(p=dropout_prob) for _ in range(len(self.network_shape) - 1)]
    )
    self.dropout_prob = dropout_prob

forward(x)

Compute the output of the model given the input tensor x.

Parameters:

    x (Tensor, required): The input tensor to the model.

Returns:

    Tensor: The output of the model.

Source code in ufcpredictor/models.py
def forward(self, x: torch.Tensor) -> torch.Tensor:
    """
    Compute the output of the model given the input tensor x.

    Args:
        x: The input tensor to the model.

    Returns:
        The output of the model.
    """
    for fc, dropout in zip(self.fcs, self.dropouts):
        x = F.relu(fc(x))
        x = dropout(x)

    return x

SimpleFightNet

Bases: Module

A neural network model designed to predict the outcome of a fight between two fighters.

The model takes into account the characteristics of both fighters and the odds of the fight. It concatenates the features of both fighters, the fight features, and the odds into a single input to the model.

The model can be used to make predictions on the outcome of a fight and to calculate the benefit of a bet.

Source code in ufcpredictor/models.py
class SimpleFightNet(nn.Module):
    """
    A neural network model designed to predict the outcome of a fight between two
    fighters.

    The model takes into account the characteristics of both fighters and the odds of
    the fight. It combines the features of both fighters as an input to the model.

    The model can be used to make predictions on the outcome of a fight and to calculate
    the benefit of a bet.
    """

    mlflow_params: List[str] = [
        "dropout_prob",
        "network_shape"
    ]

    def __init__(
        self,
        input_size: int,
        dropout_prob: float = 0.0,
        network_shape: List[int] = [1024, 512, 256, 128, 64, 1],
    ):
        """
        Initialize the SimpleFightNet model with the given input size and dropout
        probability.

        Args:
            dropout_prob: The probability of dropout.
            network_shape: Shape of the network layers (except input layer).
        """
        super().__init__()

        self.network_shape = [input_size,] + network_shape

        self.fcs = nn.ModuleList(
            [
                nn.Linear(input_, output)
                for input_, output in zip(
                    self.network_shape[:-1], self.network_shape[1:]
                )
            ]
        )
        self.dropouts = nn.ModuleList(
            [nn.Dropout(p=dropout_prob) for _ in range(len(self.network_shape) - 1)]
        )
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        self.dropout_prob = dropout_prob

    def forward(
            self,
            X1: torch.Tensor,
            X2: torch.Tensor,
            X3: torch.Tensor,
            odds1: torch.Tensor,
            odds2: torch.Tensor,
    ) -> torch.Tensor:
        """
        Compute the output of the SimpleFightNet model.

        Args:
            X1: The input tensor for the first fighter.
            X2: The input tensor for the second fighter.
            X3: The input tensor for the fight features.
            odds1: The odds tensor for the first fighter.
            odds2: The odds tensor for the second fighter.

        Returns:
            The output of the SimpleFightNet model.
        """
        x = torch.cat((X1, X2, X3, odds1, odds2), dim=1)

        for fc, dropout in zip(self.fcs[:-1], self.dropouts):
            x = self.relu(fc(x))
            x = dropout(x)

        x = self.fcs[-1](x)
        x = self.sigmoid(x)
        return x
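
Because forward() concatenates X1, X2, X3 and the two odds columns along dim=1, input_size must equal twice the number of fighter features plus the number of fight features plus 2. The sketch below uses placeholder feature counts.

import torch

from ufcpredictor.models import SimpleFightNet

n_fighter_features = 53  # placeholder for len(X_set)
n_fight_features = 0     # placeholder for len(Xf_set)
input_size = 2 * n_fighter_features + n_fight_features + 2

model = SimpleFightNet(input_size=input_size, dropout_prob=0.35)

X1 = torch.randn(4, n_fighter_features)
X2 = torch.randn(4, n_fighter_features)
X3 = torch.randn(4, n_fight_features)  # an empty fight-feature block is fine
odds1 = torch.randn(4, 1)
odds2 = torch.randn(4, 1)

p = model(X1, X2, X3, odds1, odds2)
print(p.shape)  # torch.Size([4, 1]); values lie in (0, 1) after the sigmoid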

__init__(input_size, dropout_prob=0.0, network_shape=[1024, 512, 256, 128, 64, 1])

Initialize the SimpleFightNet model with the given input size and dropout probability.

Parameters:

    input_size (int, required): The size of the input to the model.
    dropout_prob (float, default 0.0): The probability of dropout.
    network_shape (List[int], default [1024, 512, 256, 128, 64, 1]): Shape of the network layers (except input layer).

Source code in ufcpredictor/models.py, lines 205-236
def __init__(
    self,
    input_size: int,
    dropout_prob: float = 0.0,
    network_shape: List[int] = [1024, 512, 256, 128, 64, 1],
):
    """
    Initialize the SimpleFightNet model with the given input size and dropout
    probability.

    Args:
        dropout_prob: The probability of dropout.
        network_shape: Shape of the network layers (except input layer).
    """
    super().__init__()

    self.network_shape = [input_size,] + network_shape

    self.fcs = nn.ModuleList(
        [
            nn.Linear(input_, output)
            for input_, output in zip(
                self.network_shape[:-1], self.network_shape[1:]
            )
        ]
    )
    self.dropouts = nn.ModuleList(
        [nn.Dropout(p=dropout_prob) for _ in range(len(self.network_shape) - 1)]
    )
    self.relu = nn.ReLU()
    self.sigmoid = nn.Sigmoid()
    self.dropout_prob = dropout_prob

forward(X1, X2, X3, odds1, odds2)

Compute the output of the SimpleFightNet model.

Parameters:

    X1 (Tensor, required): The input tensor for the first fighter.
    X2 (Tensor, required): The input tensor for the second fighter.
    X3 (Tensor, required): The input tensor for the fight features.
    odds1 (Tensor, required): The odds tensor for the first fighter.
    odds2 (Tensor, required): The odds tensor for the second fighter.

Returns:

    Tensor: The output of the SimpleFightNet model.

Source code in ufcpredictor/models.py, lines 238-267
def forward(
        self,
        X1: torch.Tensor,
        X2: torch.Tensor,
        X3: torch.Tensor,
        odds1: torch.Tensor,
        odds2: torch.Tensor,
) -> torch.Tensor:
    """
    Compute the output of the SimpleFightNet model.

    Args:
        X1: The input tensor for the first fighter.
        X2: The input tensor for the second fighter.
        X3: The input tensor for the fight features.
        odds1: The odds tensor for the first fighter.
        odds2: The odds tensor for the second fighter.

    Returns:
        The output of the SimpleFightNet model.
    """
    x = torch.cat((X1, X2, X3, odds1, odds2), dim=1)

    for fc, dropout in zip(self.fcs[:-1], self.dropouts):
        x = self.relu(fc(x))
        x = dropout(x)

    x = self.fcs[-1](x)
    x = self.sigmoid(x)
    return x

SymmetricFightNet

Bases: Module

A neural network model designed to predict the outcome of a fight between two fighters.

The model takes into account the characteristics of both fighters and the odds of the fight. It uses a symmetric architecture to ensure that the model is fair and unbiased towards either fighter.

The model can be used to make predictions on the outcome of a fight and to calculate the benefit of a bet.
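For orientation, the hypothetical snippet below (the feature widths are invented) shows how the two fighter tensors, the fight features and the odds flow through the model; fighter_network_shape is passed explicitly so the example does not depend on FighterNet's default layout.

import torch

from ufcpredictor.models import SymmetricFightNet

batch_size = 4
n_fighter_feats = 30  # hypothetical per-fighter feature width
n_fight_feats = 3     # hypothetical fight-feature width

model = SymmetricFightNet(
    input_size=n_fighter_feats,      # width of X1 and X2
    input_size_f=n_fight_feats,      # width of X3
    dropout_prob=0.2,
    fighter_network_shape=[64, 32],  # each fighter is embedded into 32 features
)

X1 = torch.randn(batch_size, n_fighter_feats)
X2 = torch.randn(batch_size, n_fighter_feats)
X3 = torch.randn(batch_size, n_fight_feats)
odds1 = torch.rand(batch_size, 1) + 1.0
odds2 = torch.rand(batch_size, 1) + 1.0

# Both fighters pass through the same FighterNet; the head then sees
# (out1 - out2, out2 - out1, X3), so swapping the fighters permutes the input.
prob = model(X1, X2, X3, odds1, odds2)  # shape (batch_size, 1)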

Source code in ufcpredictor/models.py, lines 81-186
class SymmetricFightNet(nn.Module):
    """
    A neural network model designed to predict the outcome of a fight between two
    fighters.

    The model takes into account the characteristics of both fighters and the odds of
    the fight. It uses a symmetric architecture to ensure that the model is fair and
    unbiased towards either fighter.

    The model can be used to make predictions on the outcome of a fight and to calculate
    the benefit of a bet.
    """

    mlflow_params: List[str] = [
        "dropout_prob", "network_shape", "fighter_network_shape"
    ]

    def __init__(
        self,
        input_size: int,
        input_size_f: int,
        dropout_prob: float = 0.0,
        network_shape: List[int] = [512, 128, 64, 1],
        fighter_network_shape: Optional[List[int]] = None,
    ) -> None:
        """
        Initialize the SymmetricFightNet model with the given input size and dropout
        probability.

        Args:
            input_size: The size of the input to the model.
            dropout_prob: The probability of dropout.
            network_shape: Shape of the network layers (except input layer).
            fighter_network_shape: Shape of the network layers for the fighter
                network (except input layer).
        """
        super(SymmetricFightNet, self).__init__()

        fighter_network_args: Dict[str, Any] = {
            "input_size": input_size,
            "dropout_prob": dropout_prob,
        }
        if fighter_network_shape is not None: # pragma: no cover
            fighter_network_args["network_shape"] = fighter_network_shape

        self.fighter_net = FighterNet(**fighter_network_args)
        self.fighter_network_shape = self.fighter_net.network_shape

        self.network_shape = [
            self.fighter_network_shape[-1] * 2 + 2 + input_size_f
        ] + network_shape

        self.fcs = nn.ModuleList(
            [
                nn.Linear(input_, output)
                for input_, output in zip(
                    self.network_shape[:-1], self.network_shape[1:]
                )
            ]
        )
        self.dropouts = nn.ModuleList(
            [
                nn.Dropout(p=dropout_prob)
                for _ in range(len(self.network_shape) - 1)  # This should be -2
            ]
        )
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        self.dropout_prob = dropout_prob

    def forward(
        self,
        X1: torch.Tensor,
        X2: torch.Tensor,
        X3: torch.Tensor,
        odds1: torch.Tensor,
        odds2: torch.Tensor,
    ) -> torch.Tensor:
        """
        Compute the output of the SymmetricFightNet model.

        Args:
            X1: The input tensor for the first fighter.
            X2: The input tensor for the second fighter.
            X3: The input tensor for the fight features.
            odds1: The odds tensor for the first fighter.
            odds2: The odds tensor for the second fighter.

        Returns:
            The output of the SymmetricFightNet model.
        """
        out1 = self.fighter_net(X1)
        out2 = self.fighter_net(X2)

        out1 = torch.cat((out1, odds1), dim=1)
        out2 = torch.cat((out2, odds2), dim=1)

        x = torch.cat((out1 - out2, out2 - out1, X3), dim=1)

        for fc, dropout in zip(self.fcs[:-1], self.dropouts):
            x = self.relu(fc(x))
            x = dropout(x)

        x = self.fcs[-1](x)
        x = self.sigmoid(x)
        return x

__init__(input_size, input_size_f, dropout_prob=0.0, network_shape=[512, 128, 64, 1], fighter_network_shape=None)

Initialize the SymmetricFightNet model with the given input size and dropout probability.

Parameters:

    input_size (int, required): The size of the input to the model.
    input_size_f (int, required): The size of the fight-features input to the model.
    dropout_prob (float, default 0.0): The probability of dropout.
    network_shape (List[int], default [512, 128, 64, 1]): Shape of the network layers (except input layer).
    fighter_network_shape (Optional[List[int]], default None): Shape of the network layers for the fighter network (except input layer).

Source code in ufcpredictor/models.py, lines 98-149
def __init__(
    self,
    input_size: int,
    input_size_f: int,
    dropout_prob: float = 0.0,
    network_shape: List[int] = [512, 128, 64, 1],
    fighter_network_shape: Optional[List[int]] = None,
) -> None:
    """
    Initialize the SymmetricFightNet model with the given input size and dropout
    probability.

    Args:
        input_size: The size of the input to the model.
        dropout_prob: The probability of dropout.
        network_shape: Shape of the network layers (except input layer).
        fighter_network_shape: Shape of the network layers for the fighter
            network (except input layer).
    """
    super(SymmetricFightNet, self).__init__()

    fighter_network_args: Dict[str, Any] = {
        "input_size": input_size,
        "dropout_prob": dropout_prob,
    }
    if fighter_network_shape is not None: # pragma: no cover
        fighter_network_args["network_shape"] = fighter_network_shape

    self.fighter_net = FighterNet(**fighter_network_args)
    self.fighter_network_shape = self.fighter_net.network_shape

    self.network_shape = [
        self.fighter_network_shape[-1] * 2 + 2 + input_size_f
    ] + network_shape

    self.fcs = nn.ModuleList(
        [
            nn.Linear(input_, output)
            for input_, output in zip(
                self.network_shape[:-1], self.network_shape[1:]
            )
        ]
    )
    self.dropouts = nn.ModuleList(
        [
            nn.Dropout(p=dropout_prob)
            for _ in range(len(self.network_shape) - 1)  # This should be -2
        ]
    )
    self.relu = nn.ReLU()
    self.sigmoid = nn.Sigmoid()
    self.dropout_prob = dropout_prob

forward(X1, X2, X3, odds1, odds2)

Compute the output of the SymmetricFightNet model.

Parameters:

    X1 (Tensor, required): The input tensor for the first fighter.
    X2 (Tensor, required): The input tensor for the second fighter.
    X3 (Tensor, required): The input tensor for the fight features.
    odds1 (Tensor, required): The odds tensor for the first fighter.
    odds2 (Tensor, required): The odds tensor for the second fighter.

Returns:

    Tensor: The output of the SymmetricFightNet model.

Source code in ufcpredictor/models.py, lines 151-186
def forward(
    self,
    X1: torch.Tensor,
    X2: torch.Tensor,
    X3: torch.Tensor,
    odds1: torch.Tensor,
    odds2: torch.Tensor,
) -> torch.Tensor:
    """
    Compute the output of the SymmetricFightNet model.

    Args:
        X1: The input tensor for the first fighter.
        X2: The input tensor for the second fighter.
        X3: The input tensor for the fight features.
        odds1: The odds tensor for the first fighter.
        odds2: The odds tensor for the second fighter.

    Returns:
        The output of the SymmetricFightNet model.
    """
    out1 = self.fighter_net(X1)
    out2 = self.fighter_net(X2)

    out1 = torch.cat((out1, odds1), dim=1)
    out2 = torch.cat((out2, odds2), dim=1)

    x = torch.cat((out1 - out2, out2 - out1, X3), dim=1)

    for fc, dropout in zip(self.fcs[:-1], self.dropouts):
        x = self.relu(fc(x))
        x = dropout(x)

    x = self.fcs[-1](x)
    x = self.sigmoid(x)
    return x

Trainer

This module provides a Trainer class for training and testing PyTorch models using a specific workflow.

The Trainer class encapsulates the training and testing data, model, optimizer, loss function, and learning rate scheduler, providing a simple way to train and test a PyTorch model.

Trainer

Trainer class for training and testing a PyTorch model.

This class provides a simple way to train and test a PyTorch model using a specific training and testing workflow.

Attributes:

    train_loader (DataLoader): A DataLoader for the training data.
    test_loader (DataLoader): A DataLoader for the test data.
    model (Module): The model to be trained.
    optimizer (Optimizer): The optimizer to be used.
    loss_fn (Module): The loss function to be used.
    scheduler (Optional[ReduceLROnPlateau]): The learning rate scheduler to be used.
    device (str | device): The device to be used for training. Defaults to "cpu".
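For orientation only, here is a hypothetical setup (the data, the stand-in loss and its signature are assumptions inferred from the train() listing below): the Trainer expects a DataLoader that yields (X1, X2, X3, Y, odds1, odds2) batches and a loss callable as loss_fn(prediction, Y, odds1, odds2).

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

from ufcpredictor.models import SimpleFightNet
from ufcpredictor.trainer import Trainer


class OddsAgnosticBCELoss(nn.Module):
    """Stand-in loss: plain BCE that accepts (and ignores) the odds arguments."""

    def __init__(self) -> None:
        super().__init__()
        self.bce = nn.BCELoss()

    def forward(self, pred, Y, odds1, odds2):
        return self.bce(pred.reshape(-1), Y.reshape(-1).float())


# Random stand-in data with the batch layout the Trainer iterates over:
# (X1, X2, X3, Y, odds1, odds2)
n, n_fighter_feats, n_fight_feats = 64, 30, 3
X1, X2 = torch.randn(n, n_fighter_feats), torch.randn(n, n_fighter_feats)
X3 = torch.randn(n, n_fight_feats)
Y = torch.randint(0, 2, (n, 1)).float()
odds1, odds2 = torch.rand(n, 1) + 1.0, torch.rand(n, 1) + 1.0

loader = DataLoader(TensorDataset(X1, X2, X3, Y, odds1, odds2), batch_size=16)

model = SimpleFightNet(input_size=2 * n_fighter_feats + n_fight_feats + 2)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

trainer = Trainer(
    train_loader=loader,
    model=model,
    optimizer=optimizer,
    loss_fn=OddsAgnosticBCELoss(),
    device="cpu",
)
trainer.train(epochs=2, silent=True)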

Source code in ufcpredictor/trainer.py, lines 31-276
class Trainer:
    """
    Trainer class for training and testing a PyTorch model.

    This class provides a simple way to train and test a PyTorch model using a specific
    training and testing workflow.

    Attributes:
        train_loader (torch.utils.data.DataLoader): A DataLoader for the training data.
        test_loader (torch.utils.data.DataLoader): A DataLoader for the test data.
        model (torch.nn.Module): The model to be trained.
        optimizer (torch.optim.Optimizer): The optimizer to be used.
        loss_fn (torch.nn.Module): The loss function to be used.
        scheduler (Optional[torch.optim.lr_scheduler.ReduceLROnPlateau]): The learning
            rate scheduler to be used.
        device (str | torch.device): The device to be used for training. Defaults to
            "cpu".
    """

    def __init__(
        self,
        train_loader: torch.utils.data.DataLoader,
        model: torch.nn.Module,
        optimizer: torch.optim.Optimizer,
        loss_fn: torch.nn.Module,
        test_loader: Optional[torch.utils.data.DataLoader] = None,
        scheduler: Optional[torch.optim.lr_scheduler.ReduceLROnPlateau] = None,
        device: str | torch.device = "cpu",
        mlflow_tracking: bool = False,
    ):
        """
        Initialize the Trainer object.

        Args:
            train_loader: A DataLoader for the training data.
            test_loader: A DataLoader for the test data.
            model: The model to be trained.
            optimizer: The optimizer to be used.
            loss_fn: The loss function to be used.
            scheduler: The learning rate scheduler to be used.
            device: The device to be used for training. Defaults to "cpu".
        """
        self.train_loader = train_loader
        self.test_loader = test_loader
        self.model = model
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.device = device
        self.loss_fn = loss_fn.to(device)
        self.epoch_counter: int = 0
        self.mlflow_tracking = mlflow_tracking

        if self.mlflow_tracking:  # pragma: no cover
            params = {
                "optimizer": self.optimizer.__class__.__name__,
                "learning_rate": self.optimizer.param_groups[0]["lr"],
                "scheduler": (
                    self.scheduler.__class__.__name__ if self.scheduler else None
                ),
                "scheduler_mode": self.scheduler.mode if self.scheduler else None,
                "scheduler_factor": self.scheduler.factor if self.scheduler else None,
                "scheduler_patience": (
                    self.scheduler.patience if self.scheduler else None
                ),
            }
            data_processor = cast(
                BasicDataset, self.train_loader.dataset
            ).data_processor
            data_aggregator = data_processor.data_aggregator

            for label, object_ in zip(
                ["loss_function", "model", "data_processor", "data_aggregator"],
                [self.loss_fn, self.model, data_processor, data_aggregator],
            ):
                params[label] = object_.__class__.__name__
                if hasattr(object_, "mlflow_params"):
                    for param in object_.mlflow_params:
                        params[label + "_" + param] = getattr(object_, param)

            data_enhancers = data_processor.data_enhancers
            # sort extra fields by name
            data_enhancers.sort(key=lambda x: x.__class__.__name__)

            for i, data_enhancer in enumerate(data_processor.data_enhancers):
                params["data_enhancer_" + str(i)] = data_enhancer.__class__.__name__
                for param in data_enhancer.mlflow_params:
                    params["data_enhancer_" + str(i) + "_" + param] = getattr(
                        data_enhancer, param
                    )

            for set_ in "X_set", "Xf_set":
                if hasattr(self.train_loader.dataset, set_):
                    params[set_] = sorted(
                        getattr(self.train_loader.dataset, set_)
                    )

            mlflow.log_params(dict(sorted(params.items())))

    def train(
        self,
        train_loader: torch.utils.data.DataLoader | None = None,
        test_loader: torch.utils.data.DataLoader | None = None,
        epochs: int = 10,
        silent: bool = False,
    ) -> None:
        """
        Train the model for a given number of epochs.

        Args:
            train_loader: The DataLoader for the training data. Defaults to the
                DataLoader passed to the Trainer constructor.
            test_loader: The DataLoader for the test data. Defaults to the
                DataLoader passed to the Trainer constructor.
            epochs: The number of epochs to train for. Defaults to 10.
            silent: Whether to not print training progress. Defaults to False.

        Returns:
            None
        """
        if train_loader is None:
            train_loader = self.train_loader

        self.model.to(self.device)

        target_preds = []
        target_labels = []

        for epoch in range(1, epochs + 1):
            self.epoch_counter += 1
            self.model.train()
            train_loss = []

            for X1, X2, X3, Y, odds1, odds2 in tqdm(iter(train_loader), disable=silent):
                X1, X2, X3, Y, odds1, odds2 = (
                    X1.to(self.device),
                    X2.to(self.device),
                    X3.to(self.device),
                    Y.to(self.device),
                    odds1.to(self.device),
                    odds2.to(self.device),
                )

                self.optimizer.zero_grad()
                target_logit = self.model(X1, X2, X3, odds1, odds2)
                loss = self.loss_fn(target_logit, Y, odds1, odds2)

                loss.backward()
                self.optimizer.step()

                train_loss.append(loss.item())

                target_preds += (
                    torch.round(target_logit).detach().cpu().numpy().tolist()
                )
                target_labels += Y.detach().cpu().numpy().tolist()

            match = np.asarray(target_preds).reshape(-1) == np.asarray(
                target_labels
            ).reshape(-1)

            val_loss, val_target_f1, correct, _, _ = self.test(test_loader, silent=silent)

            if not silent:
                print(f"Train acc: [{match.sum() / len(match):.5f}]")
                print(
                    f"Epoch : [{epoch}] Train Loss : [{np.mean(train_loss):.5f}] "
                    f"Val Loss : [{val_loss:.5f}] Disaster? F1 : [{val_target_f1:.5f}] "
                    f"Correct: [{correct*100:.2f}]"
                )

            if self.mlflow_tracking:  # pragma: no cover
                mlflow.log_metric(
                    "train_loss", np.mean(train_loss), step=self.epoch_counter
                )
                mlflow.log_metric(
                    "val_loss", cast(float, np.mean(val_loss)), step=self.epoch_counter
                )
                mlflow.log_metric(
                    "val_f1_score", val_target_f1, step=self.epoch_counter
                )

            if self.scheduler is not None:
                self.scheduler.step(val_loss)

    def test(
        self, test_loader: torch.utils.data.DataLoader | None = None, silent: bool =False,
    ) -> Tuple[float, float, float, List, List]:
        """
        Evaluates the model on the test data and returns the validation loss, target F1
        score, proportion of correct predictions, target predictions, and target labels.

        Args:
            test_loader: The DataLoader for the test data. Defaults to the DataLoader
                passed to the Trainer constructor.
            silent: Whether to not print training progress. Defaults to False.

        Returns:
            A tuple containing the validation loss, target F1 score, proportion of correct
            predictions, target predictions, and target labels.
        """
        if test_loader is None:
            if self.test_loader is None:
                return 0, 0, 0, [], []
            else:
                test_loader = self.test_loader

        self.model.eval()
        val_loss = []

        target_preds = []
        target = []
        target_labels = []

        with torch.no_grad():
            for X1, X2, X3, Y, odds1, odds2 in tqdm(iter(test_loader), disable=silent):
                X1, X2, X3, Y, odds1, odds2 = (
                    X1.to(self.device),
                    X2.to(self.device),
                    X3.to(self.device),
                    Y.to(self.device),
                    odds1.to(self.device),
                    odds2.to(self.device),
                )
                target_logit = self.model(X1, X2, X3, odds1, odds2)
                loss = self.loss_fn(target_logit, Y, odds1, odds2)
                val_loss.append(loss.item())

                target += target_logit
                target_preds += (
                    torch.round(target_logit).detach().cpu().numpy().tolist()
                )
                target_labels += Y.detach().cpu().numpy().tolist()

        match = np.asarray(target_preds).reshape(-1) == np.asarray(
            target_labels
        ).reshape(-1)

        target_f1 = f1_score(target_labels, target_preds, average="macro")

        return (
            np.mean(val_loss),
            target_f1,
            match.sum() / len(match),
            target,
            target_labels,
        )

__init__(train_loader, model, optimizer, loss_fn, test_loader=None, scheduler=None, device='cpu', mlflow_tracking=False)

Initialize the Trainer object.

Parameters:

    train_loader (DataLoader, required): A DataLoader for the training data.
    test_loader (Optional[DataLoader], default None): A DataLoader for the test data.
    model (Module, required): The model to be trained.
    optimizer (Optimizer, required): The optimizer to be used.
    loss_fn (Module, required): The loss function to be used.
    scheduler (Optional[ReduceLROnPlateau], default None): The learning rate scheduler to be used.
    device (str | device, default 'cpu'): The device to be used for training.
    mlflow_tracking (bool, default False): Whether to log run parameters and metrics to MLflow.

Source code in ufcpredictor/trainer.py, lines 50-127
def __init__(
    self,
    train_loader: torch.utils.data.DataLoader,
    model: torch.nn.Module,
    optimizer: torch.optim.Optimizer,
    loss_fn: torch.nn.Module,
    test_loader: Optional[torch.utils.data.DataLoader] = None,
    scheduler: Optional[torch.optim.lr_scheduler.ReduceLROnPlateau] = None,
    device: str | torch.device = "cpu",
    mlflow_tracking: bool = False,
):
    """
    Initialize the Trainer object.

    Args:
        train_loader: A DataLoader for the training data.
        test_loader: A DataLoader for the test data.
        model: The model to be trained.
        optimizer: The optimizer to be used.
        loss_fn: The loss function to be used.
        scheduler: The learning rate scheduler to be used.
        device: The device to be used for training. Defaults to "cpu".
    """
    self.train_loader = train_loader
    self.test_loader = test_loader
    self.model = model
    self.optimizer = optimizer
    self.scheduler = scheduler
    self.device = device
    self.loss_fn = loss_fn.to(device)
    self.epoch_counter: int = 0
    self.mlflow_tracking = mlflow_tracking

    if self.mlflow_tracking:  # pragma: no cover
        params = {
            "optimizer": self.optimizer.__class__.__name__,
            "learning_rate": self.optimizer.param_groups[0]["lr"],
            "scheduler": (
                self.scheduler.__class__.__name__ if self.scheduler else None
            ),
            "scheduler_mode": self.scheduler.mode if self.scheduler else None,
            "scheduler_factor": self.scheduler.factor if self.scheduler else None,
            "scheduler_patience": (
                self.scheduler.patience if self.scheduler else None
            ),
        }
        data_processor = cast(
            BasicDataset, self.train_loader.dataset
        ).data_processor
        data_aggregator = data_processor.data_aggregator

        for label, object_ in zip(
            ["loss_function", "model", "data_processor", "data_aggregator"],
            [self.loss_fn, self.model, data_processor, data_aggregator],
        ):
            params[label] = object_.__class__.__name__
            if hasattr(object_, "mlflow_params"):
                for param in object_.mlflow_params:
                    params[label + "_" + param] = getattr(object_, param)

        data_enhancers = data_processor.data_enhancers
        # sort extra fields by name
        data_enhancers.sort(key=lambda x: x.__class__.__name__)

        for i, data_enhancer in enumerate(data_processor.data_enhancers):
            params["data_enhancer_" + str(i)] = data_enhancer.__class__.__name__
            for param in data_enhancer.mlflow_params:
                params["data_enhancer_" + str(i) + "_" + param] = getattr(
                    data_enhancer, param
                )

        for set_ in "X_set", "Xf_set":
            if hasattr(self.train_loader.dataset, set_):
                params[set_] = sorted(
                    getattr(self.train_loader.dataset, set_)
                )

        mlflow.log_params(dict(sorted(params.items())))

test(test_loader=None, silent=False)

Evaluates the model on the test data and returns the validation loss, target F1 score, proportion of correct predictions, target predictions, and target labels.

Parameters:

    test_loader (DataLoader | None, default None): The DataLoader for the test data. Defaults to the DataLoader passed to the Trainer constructor.
    silent (bool, default False): If True, do not print evaluation progress.

Returns:

    Tuple[float, float, float, List, List]: A tuple containing the validation loss, target F1 score, proportion of correct predictions, target predictions, and target labels.

Source code in ufcpredictor/trainer.py, lines 215-276
def test(
    self, test_loader: torch.utils.data.DataLoader | None = None, silent: bool =False,
) -> Tuple[float, float, float, List, List]:
    """
    Evaluates the model on the test data and returns the validation loss, target F1
    score, proportion of correct predictions, target predictions, and target labels.

    Args:
        test_loader: The DataLoader for the test data. Defaults to the DataLoader
            passed to the Trainer constructor.
        silent: Whether to not print training progress. Defaults to False.

    Returns:
        A tuple containing the validation loss, target F1 score, proportion of correct
        predictions, target predictions, and target labels.
    """
    if test_loader is None:
        if self.test_loader is None:
            return 0, 0, 0, [], []
        else:
            test_loader = self.test_loader

    self.model.eval()
    val_loss = []

    target_preds = []
    target = []
    target_labels = []

    with torch.no_grad():
        for X1, X2, X3, Y, odds1, odds2 in tqdm(iter(test_loader), disable=silent):
            X1, X2, X3, Y, odds1, odds2 = (
                X1.to(self.device),
                X2.to(self.device),
                X3.to(self.device),
                Y.to(self.device),
                odds1.to(self.device),
                odds2.to(self.device),
            )
            target_logit = self.model(X1, X2, X3, odds1, odds2)
            loss = self.loss_fn(target_logit, Y, odds1, odds2)
            val_loss.append(loss.item())

            target += target_logit
            target_preds += (
                torch.round(target_logit).detach().cpu().numpy().tolist()
            )
            target_labels += Y.detach().cpu().numpy().tolist()

    match = np.asarray(target_preds).reshape(-1) == np.asarray(
        target_labels
    ).reshape(-1)

    target_f1 = f1_score(target_labels, target_preds, average="macro")

    return (
        np.mean(val_loss),
        target_f1,
        match.sum() / len(match),
        target,
        target_labels,
    )

train(train_loader=None, test_loader=None, epochs=10, silent=False)

Train the model for a given number of epochs.

Parameters:

    train_loader (DataLoader | None, default None): The DataLoader for the training data. Defaults to the DataLoader passed to the Trainer constructor.
    test_loader (DataLoader | None, default None): The DataLoader for the test data. Defaults to the DataLoader passed to the Trainer constructor.
    epochs (int, default 10): The number of epochs to train for.
    silent (bool, default False): If True, do not print training progress.

Returns:

    None

Source code in ufcpredictor/trainer.py, lines 129-213
def train(
    self,
    train_loader: torch.utils.data.DataLoader | None = None,
    test_loader: torch.utils.data.DataLoader | None = None,
    epochs: int = 10,
    silent: bool = False,
) -> None:
    """
    Train the model for a given number of epochs.

    Args:
        train_loader: The DataLoader for the training data. Defaults to the
            DataLoader passed to the Trainer constructor.
        test_loader: The DataLoader for the test data. Defaults to the
            DataLoader passed to the Trainer constructor.
        epochs: The number of epochs to train for. Defaults to 10.
        silent: Whether to not print training progress. Defaults to False.

    Returns:
        None
    """
    if train_loader is None:
        train_loader = self.train_loader

    self.model.to(self.device)

    target_preds = []
    target_labels = []

    for epoch in range(1, epochs + 1):
        self.epoch_counter += 1
        self.model.train()
        train_loss = []

        for X1, X2, X3, Y, odds1, odds2 in tqdm(iter(train_loader), disable=silent):
            X1, X2, X3, Y, odds1, odds2 = (
                X1.to(self.device),
                X2.to(self.device),
                X3.to(self.device),
                Y.to(self.device),
                odds1.to(self.device),
                odds2.to(self.device),
            )

            self.optimizer.zero_grad()
            target_logit = self.model(X1, X2, X3, odds1, odds2)
            loss = self.loss_fn(target_logit, Y, odds1, odds2)

            loss.backward()
            self.optimizer.step()

            train_loss.append(loss.item())

            target_preds += (
                torch.round(target_logit).detach().cpu().numpy().tolist()
            )
            target_labels += Y.detach().cpu().numpy().tolist()

        match = np.asarray(target_preds).reshape(-1) == np.asarray(
            target_labels
        ).reshape(-1)

        val_loss, val_target_f1, correct, _, _ = self.test(test_loader, silent=silent)

        if not silent:
            print(f"Train acc: [{match.sum() / len(match):.5f}]")
            print(
                f"Epoch : [{epoch}] Train Loss : [{np.mean(train_loss):.5f}] "
                f"Val Loss : [{val_loss:.5f}] Disaster? F1 : [{val_target_f1:.5f}] "
                f"Correct: [{correct*100:.2f}]"
            )

        if self.mlflow_tracking:  # pragma: no cover
            mlflow.log_metric(
                "train_loss", np.mean(train_loss), step=self.epoch_counter
            )
            mlflow.log_metric(
                "val_loss", cast(float, np.mean(val_loss)), step=self.epoch_counter
            )
            mlflow.log_metric(
                "val_f1_score", val_target_f1, step=self.epoch_counter
            )

        if self.scheduler is not None:
            self.scheduler.step(val_loss)

Plot tools

This module provides tools for plotting and visualizing predictions made by UFC predictor models.

PredictionPlots

Provides tools for visualizing and analyzing the predictions made by UFC predictor models.

This class contains methods for displaying the prediction details of a fight, including the prediction, shift, odds, and correctness. It also calculates and displays the total invested, earnings, number of bets, and number of fights. Additionally, it can show a plot of the benefit of the model over time.
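The staking rule behind these summaries is compact enough to restate on its own. The stand-alone sketch below (with invented numbers) mirrors the logic in show_fight_prediction_detail: the stake grows linearly with the model's distance from 0.5, up to 10 units, and a correct call is paid at the decimal odds of the side that was backed (odd2 when the prediction is above 0.5, odd1 otherwise).

def bet_and_earning(prediction: float, odd1: float, odd2: float, correct: bool) -> tuple[float, float]:
    """Mirror the staking rule used in show_fight_prediction_detail."""
    if prediction > 0.5:
        bet = 2 * 10 * (prediction - 0.5)
        earning = odd2 * bet if correct else 0.0
    else:
        bet = 2 * 10 * (0.5 - prediction)
        earning = odd1 * bet if correct else 0.0
    return bet, earning


# Invented example: a 0.70 prediction that turns out to be right.
bet, earning = bet_and_earning(prediction=0.70, odd1=1.60, odd2=2.40, correct=True)
print(round(bet, 2), round(earning, 2))  # 4.0 9.6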

Source code in ufcpredictor/plot_tools.py, lines 27-325
class PredictionPlots:
    """
    Provides tools for visualizing and analyzing the predictions made by UFC predictor
    models.

    This class contains methods for displaying the prediction details of a fight,
    including the prediction, shift, odds, and correctness. It also calculates and
    displays the total invested, earnings, number of bets, and number of fights.
    Additionally, it can show a plot of the benefit of the model over time.
    """

    @staticmethod
    def show_fight_prediction_detail(
        model: nn.Module,
        data: Tuple[
            torch.Tensor,
            torch.Tensor,
            torch.Tensor,
            torch.Tensor,
            torch.Tensor,
            torch.Tensor,
            NDArray[np.str_],
            NDArray[np.str_],
        ],
        print_info: bool = False,
        show_plot: bool = False,
        ax: Optional[plt.Axes] = None,
        device: str = "cpu",
    ) -> List[Tuple[float, int, float, float, bool, float, float]]:
        """
        Shows the prediction detail of a fight and the benefit of the model.

        It prints the prediction, shift, odd1, odd2, and correct for each fight.
        It also prints the total invested, earnings, number of bets and number
            of fights.
        Finally, it prints the benefit of the model as a percentage.

        Args:
            model : The model to use to make predictions.
            data : The data to use to make predictions. It should contain the fighter
                and opponent data, the label, the odds and the names of the fighters.
            print_info : If True, print the prediction, shift, odd1, odd2, and correct
                for each fight. If False, do not print anything.
            show_plot : If True, show a plot of the benefit of the model over time.
            ax : The axes to use to show the plot. If None, a new figure will be
                created.
        """
        X1, X2, X3, Y, odds1, odds2, fighter_names, opponent_names = data
        X1, X2, X3, Y, odds1, odds2, model = (
            X1.to(device),
            X2.to(device),
            X3.to(device),
            Y.to(device),
            odds1.to(device),
            odds2.to(device),
            model.to(device),
        )
        stats = []

        with torch.no_grad():
            predictions_1 = (
                model(X1, X2, X3, odds1.reshape(-1, 1), odds2.reshape(-1, 1))
                .detach()
                .cpu()
                .numpy()
                .reshape(-1)
            )
            predictions_2 = 1 - model(
                X2, X1, X3, odds2.reshape(-1, 1), odds1.reshape(-1, 1)
            ).detach().cpu().numpy().reshape(-1)

            predictions = 0.5 * (predictions_1 + predictions_2)
            shifts = abs(predictions_2 - predictions_1)

            corrects = predictions.round() == Y.cpu().numpy()

            invested = 0
            earnings = 0
            fights = 0
            nbets = 0

            invest_progress = []
            earning_progress = []

            for fighter, opponent, prediction, shift, odd1, odd2, correct, Yi in zip(
                fighter_names,
                opponent_names,
                predictions,
                shifts,
                odds1.cpu().numpy().reshape(-1),
                odds2.cpu().numpy().reshape(-1),
                corrects,
                Y.cpu().numpy().tolist(),
            ):
                prediction = round(float(prediction), 3)
                shift = round(float(shift), 3)

                if prediction > 0.5:
                    bet = 2 * 10 * (prediction - 0.5)
                    earning = odd2 * bet if correct else 0
                else:
                    bet = 2 * 10 * (0.5 - prediction)
                    earning = odd1 * bet if correct else 0

                invested += bet
                earnings += earning

                invest_progress.append(bet)
                earning_progress.append(earning)

                fights += 1
                nbets += 1

                if print_info:  # pragma: no cover
                    print(fighter, "vs", opponent)
                    print(odd1, "vs", odd2)
                    print(prediction, shift)

                    print(f"bet: {bet:.2f}, earn: {earning:.2f}")
                    print(
                        f"invested: {invested:.2f}, earnings: {earnings:.2f}, nbets: {nbets}, fights: {fights}"
                    )
                    print(f"benefits: {(earnings/invested-1)*100:.2f}%")

                    print()
                stats.append(
                    (
                        prediction,
                        Yi,
                        odd1,
                        odd2,
                        correct,
                        bet,
                        earning,
                    )
                )

        if show_plot:
            if ax is None:  # pragma: no cover
                fig, ax = plt.subplots()

            ax.plot(
                np.cumsum(invest_progress),
                (np.cumsum(earning_progress) - np.cumsum(invest_progress))
                / np.cumsum(invest_progress)
                * 100,
            )
            ax.axhline(0, c="k")

        return stats

    @staticmethod
    def show_fight_prediction_detail_from_dataset(
        model: nn.Module,
        dataset: BasicDataset,
        fight_ids: Optional[List[str]] = None,
        print_info: bool = False,
        show_plot: bool = False,
        ax: Optional[plt.Axes] = None,
        device: str = "cpu",
    ) -> List[Tuple[float, int, float, float, bool, float, float, str]]:
        """
        Shows the prediction detail of a fight and the benefit of the model.

        It uses the dataset to get the data for the specified fight ids.
        It then calls show_fight_prediction_detail with the model and the data.

        Args:
            model : The model to use to make predictions.
            dataset : The dataset to use to get the data.
            fight_ids : The id of the fight to use. If None, it will use all the data
                in the dataset.
            print_info : If True, print the prediction, shift, odd1, odd2, and correct
                for each fight. If False, do not print anything.
            show_plot : If True, show a plot of the benefit of the model over time.
            ax : The axes to use to show the plot. If None, a new figure will be
                created.
        """
        X1, X2, X3, Y, odds1, odds2, fighter_names, opponent_names = (
            dataset.get_fight_data_from_ids(fight_ids)
        )

        stats = PredictionPlots.show_fight_prediction_detail(
            model,
            (X1, X2, X3, Y, odds1, odds2, fighter_names, opponent_names),
            print_info,
            show_plot,
            ax,
            device=device,
        )

        return [
            fight_stats + (fight_id,)
            for fight_stats, fight_id in zip(
                stats, dataset.fight_data["fight_id"].values
            )
        ]

    @staticmethod
    def plot_single_prediction(
        model: nn.Module,
        dataset: ForecastDataset,
        fighter_name: str,
        opponent_name: str,
        fight_features: List[float],
        event_date: str | datetime.date,
        odds1: int,
        odds2: int,
        ax: Optional[plt.Axes] = None,
        parse_id: bool = False,
    ) -> None:
        """
        Plots the prediction for a single fight.

        Args:
            model : The model to use to make predictions.
            dataset : The dataset to use to get the data.
            fighter_name : The name of the first fighter.
            opponent_name : The name of the second fighter.
            event_date : The date of the fight.
            odds1 : The odds for the first fighter (decimal).
            odds2 : The odds for the second fighter (decimal).
            ax : The axes to use to show the plot. If None, a new figure will be
                created.
            parse_id : If True, the id of the fighters is parsed instead of the name.
        """
        p1, p2 = dataset.get_single_forecast_prediction(
            fighter_name, opponent_name, event_date, odds1, odds2, model, fight_features, parse_id
        )

        if parse_id:
            names = dataset.data_processor.data["fighter_name"]
            ids = dataset.data_processor.data["fighter_id"]

            display_fighter_name = names[ids == fighter_name].values[0]
            display_opponent_name = names[ids == opponent_name].values[0]
        else:
            display_fighter_name = fighter_name
            display_opponent_name = opponent_name

        if ax is None:  # pragma: no cover
            fig, ax = plt.subplots()

        prediction = ((p1 + p2) - 1) * 100
        shift = np.abs(p1 - p2) * 2 * 100

        red = "tab:red"
        blue = "tab:blue"

        color = red if prediction <= 0 else blue

        ax.barh(
            0,
            prediction,
            xerr=shift,
            color=color,
            capsize=5,
            height=0.7,
        )
        ax.set_ylim(-1, 1)
        ax.set_xlim(-100, 100)

        ticks = np.arange(-100, 101, 25, dtype=int)
        ax.set_xticks(ticks)
        ax.set_xticklabels([abs(tick) for tick in ticks])

        ax.text(
            ax.get_xlim()[0],
            ax.get_ylim()[1] * 1.3,
            display_fighter_name,
            color=red,
            ha="left",
            va="center",
            fontsize=12,
            fontweight="bold",
        )

        ax.text(
            ax.get_xlim()[1],
            ax.get_ylim()[1] * 1.3,
            display_opponent_name,
            color=blue,
            ha="right",
            va="center",
            fontsize=12,
            fontweight="bold",
        )

        ax.axvline(x=0, color="lightgray", lw=1)
        ax.text(
            prediction,
            ax.get_ylim()[1] * 0.5,
            f"{abs(prediction):.2f}±{shift:.2f}",
            color=color,
            ha="center",  # "left" if prediction > 0 else "right",
            va="center",
            fontsize=11,
            fontweight="bold",
        )

plot_single_prediction(model, dataset, fighter_name, opponent_name, fight_features, event_date, odds1, odds2, ax=None, parse_id=False) staticmethod

Plots the prediction for a single fight.
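The plotted bar encodes a symmetrised forecast. With invented values p1 = 0.62 and p2 = 0.58 returned by get_single_forecast_prediction, the quantities drawn by this method work out as follows (a worked illustration, not library output):

p1, p2 = 0.62, 0.58  # hypothetical forecasts for the two fighter orderings

prediction = ((p1 + p2) - 1) * 100  # about 20: positive, so the bar leans towards the opponent (blue, right)
shift = abs(p1 - p2) * 2 * 100      # about 8: drawn as the error bar on the prediction

print(round(prediction, 2), round(shift, 2))  # 20.0 8.0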

Parameters:

    model (nn.Module, required): The model to use to make predictions.
    dataset (ForecastDataset, required): The dataset to use to get the data.
    fighter_name (str, required): The name of the first fighter.
    opponent_name (str, required): The name of the second fighter.
    fight_features (List[float], required): The fight-feature values for the bout.
    event_date (str | datetime.date, required): The date of the fight.
    odds1 (int, required): The odds for the first fighter (decimal).
    odds2 (int, required): The odds for the second fighter (decimal).
    ax (Optional[plt.Axes], default None): The axes to use to show the plot. If None, a new figure will be created.
    parse_id (bool, default False): If True, the id of the fighters is parsed instead of the name.

Source code in ufcpredictor/plot_tools.py, lines 225-325
@staticmethod
def plot_single_prediction(
    model: nn.Module,
    dataset: ForecastDataset,
    fighter_name: str,
    opponent_name: str,
    fight_features: List[float],
    event_date: str | datetime.date,
    odds1: int,
    odds2: int,
    ax: Optional[plt.Axes] = None,
    parse_id: bool = False,
) -> None:
    """
    Plots the prediction for a single fight.

    Args:
        model : The model to use to make predictions.
        dataset : The dataset to use to get the data.
        fighter_name : The name of the first fighter.
        opponent_name : The name of the second fighter.
        event_date : The date of the fight.
        odds1 : The odds for the first fighter (decimal).
        odds2 : The odds for the second fighter (decimal).
        ax : The axes to use to show the plot. If None, a new figure will be
            created.
        parse_id : If True, the id of the fighters is parsed instead of the name.
    """
    p1, p2 = dataset.get_single_forecast_prediction(
        fighter_name, opponent_name, event_date, odds1, odds2, model, fight_features, parse_id
    )

    if parse_id:
        names = dataset.data_processor.data["fighter_name"]
        ids = dataset.data_processor.data["fighter_id"]

        display_fighter_name = names[ids == fighter_name].values[0]
        display_opponent_name = names[ids == opponent_name].values[0]
    else:
        display_fighter_name = fighter_name
        display_opponent_name = opponent_name

    if ax is None:  # pragma: no cover
        fig, ax = plt.subplots()

    prediction = ((p1 + p2) - 1) * 100
    shift = np.abs(p1 - p2) * 2 * 100

    red = "tab:red"
    blue = "tab:blue"

    color = red if prediction <= 0 else blue

    ax.barh(
        0,
        prediction,
        xerr=shift,
        color=color,
        capsize=5,
        height=0.7,
    )
    ax.set_ylim(-1, 1)
    ax.set_xlim(-100, 100)

    ticks = np.arange(-100, 101, 25, dtype=int)
    ax.set_xticks(ticks)
    ax.set_xticklabels([abs(tick) for tick in ticks])

    ax.text(
        ax.get_xlim()[0],
        ax.get_ylim()[1] * 1.3,
        display_fighter_name,
        color=red,
        ha="left",
        va="center",
        fontsize=12,
        fontweight="bold",
    )

    ax.text(
        ax.get_xlim()[1],
        ax.get_ylim()[1] * 1.3,
        display_opponent_name,
        color=blue,
        ha="right",
        va="center",
        fontsize=12,
        fontweight="bold",
    )

    ax.axvline(x=0, color="lightgray", lw=1)
    ax.text(
        prediction,
        ax.get_ylim()[1] * 0.5,
        f"{abs(prediction):.2f}±{shift:.2f}",
        color=color,
        ha="center",  # "left" if prediction > 0 else "right",
        va="center",
        fontsize=11,
        fontweight="bold",
    )

show_fight_prediction_detail(model, data, print_info=False, show_plot=False, ax=None, device='cpu') staticmethod

Shows the prediction detail of a fight and the benefit of the model.

It prints the prediction, shift, odd1, odd2, and correct for each fight. It also prints the total invested, earnings, number of bets and number of fights. Finally, it prints the benefit of the model as a percentage.
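Internally the method queries the model twice, once per fighter ordering, and averages the two views. The short sketch below isolates that symmetrisation step using a hypothetical SimpleFightNet and random tensors (shapes are invented, mirroring the listing further down):

import torch

from ufcpredictor.models import SimpleFightNet

n, n_fighter_feats, n_fight_feats = 8, 30, 3  # invented batch and feature sizes
model = SimpleFightNet(input_size=2 * n_fighter_feats + n_fight_feats + 2)

X1, X2 = torch.randn(n, n_fighter_feats), torch.randn(n, n_fighter_feats)
X3 = torch.randn(n, n_fight_feats)
odds1, odds2 = torch.rand(n, 1) + 1.0, torch.rand(n, 1) + 1.0

with torch.no_grad():
    p1 = model(X1, X2, X3, odds1, odds2).reshape(-1)      # ordering (fighter, opponent)
    p2 = 1 - model(X2, X1, X3, odds2, odds1).reshape(-1)  # swapped ordering, flipped back

prediction = 0.5 * (p1 + p2)  # ordering-averaged forecast used for the betting summary
shift = (p1 - p2).abs()       # disagreement between the two orderings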

Parameters:

    model (nn.Module, required): The model to use to make predictions.
    data (Tuple, required): The data to use to make predictions. It should contain the fighter and opponent data, the label, the odds and the names of the fighters.
    print_info (bool, default False): If True, print the prediction, shift, odd1, odd2, and correct for each fight. If False, do not print anything.
    show_plot (bool, default False): If True, show a plot of the benefit of the model over time.
    ax (Optional[plt.Axes], default None): The axes to use to show the plot. If None, a new figure will be created.
    device (str, default 'cpu'): The device on which to run the model.

Source code in ufcpredictor/plot_tools.py, lines 38-176
@staticmethod
def show_fight_prediction_detail(
    model: nn.Module,
    data: Tuple[
        torch.Tensor,
        torch.Tensor,
        torch.Tensor,
        torch.Tensor,
        torch.Tensor,
        torch.Tensor,
        NDArray[np.str_],
        NDArray[np.str_],
    ],
    print_info: bool = False,
    show_plot: bool = False,
    ax: Optional[plt.Axes] = None,
    device: str = "cpu",
) -> List[Tuple[float, int, float, float, bool, float, float]]:
    """
    Shows the prediction detail of a fight and the benefit of the model.

    It prints the prediction, shift, odd1, odd2, and correct for each fight.
    It also prints the total invested, earnings, number of bets and number
        of fights.
    Finally, it prints the benefit of the model as a percentage.

    Args:
        model : The model to use to make predictions.
        data : The data to use to make predictions. It should contain the fighter
            and opponent data, the label, the odds and the names of the fighters.
        print_info : If True, print the prediction, shift, odd1, odd2, and correct
            for each fight. If False, do not print anything.
        show_plot : If True, show a plot of the benefit of the model over time.
        ax : The axes to use to show the plot. If None, a new figure will be
            created.
    """
    X1, X2, X3, Y, odds1, odds2, fighter_names, opponent_names = data
    X1, X2, X3, Y, odds1, odds2, model = (
        X1.to(device),
        X2.to(device),
        X3.to(device),
        Y.to(device),
        odds1.to(device),
        odds2.to(device),
        model.to(device),
    )
    stats = []

    with torch.no_grad():
        predictions_1 = (
            model(X1, X2, X3, odds1.reshape(-1, 1), odds2.reshape(-1, 1))
            .detach()
            .cpu()
            .numpy()
            .reshape(-1)
        )
        predictions_2 = 1 - model(
            X2, X1, X3, odds2.reshape(-1, 1), odds1.reshape(-1, 1)
        ).detach().cpu().numpy().reshape(-1)

        predictions = 0.5 * (predictions_1 + predictions_2)
        shifts = abs(predictions_2 - predictions_1)

        corrects = predictions.round() == Y.cpu().numpy()

        invested = 0
        earnings = 0
        fights = 0
        nbets = 0

        invest_progress = []
        earning_progress = []

        for fighter, opponent, prediction, shift, odd1, odd2, correct, Yi in zip(
            fighter_names,
            opponent_names,
            predictions,
            shifts,
            odds1.cpu().numpy().reshape(-1),
            odds2.cpu().numpy().reshape(-1),
            corrects,
            Y.cpu().numpy().tolist(),
        ):
            prediction = round(float(prediction), 3)
            shift = round(float(shift), 3)

            if prediction > 0.5:
                bet = 2 * 10 * (prediction - 0.5)
                earning = odd2 * bet if correct else 0
            else:
                bet = 2 * 10 * (0.5 - prediction)
                earning = odd1 * bet if correct else 0

            invested += bet
            earnings += earning

            invest_progress.append(bet)
            earning_progress.append(earning)

            fights += 1
            nbets += 1

            if print_info:  # pragma: no cover
                print(fighter, "vs", opponent)
                print(odd1, "vs", odd2)
                print(prediction, shift)

                print(f"bet: {bet:.2f}, earn: {earning:.2f}")
                print(
                    f"invested: {invested:.2f}, earnings: {earnings:.2f}, nbets: {nbets}, fights: {fights}"
                )
                print(f"benefits: {(earnings/invested-1)*100:.2f}%")

                print()
            stats.append(
                (
                    prediction,
                    Yi,
                    odd1,
                    odd2,
                    correct,
                    bet,
                    earning,
                )
            )

    if show_plot:
        if ax is None:  # pragma: no cover
            fig, ax = plt.subplots()

        ax.plot(
            np.cumsum(invest_progress),
            (np.cumsum(earning_progress) - np.cumsum(invest_progress))
            / np.cumsum(invest_progress)
            * 100,
        )
        ax.axhline(0, c="k")

    return stats

show_fight_prediction_detail_from_dataset(model, dataset, fight_ids=None, print_info=False, show_plot=False, ax=None, device='cpu') staticmethod

Shows the prediction detail of a fight and the benefit of the model.

It uses the dataset to get the data for the specified fight ids. It then calls show_fight_prediction_detail with the model and the data.

Parameters:

    model (nn.Module, required): The model to use to make predictions.
    dataset (BasicDataset, required): The dataset to use to get the data.
    fight_ids (Optional[List[str]], default None): The ids of the fights to use. If None, all the data in the dataset is used.
    print_info (bool, default False): If True, print the prediction, shift, odd1, odd2, and correct for each fight. If False, do not print anything.
    show_plot (bool, default False): If True, show a plot of the benefit of the model over time.
    ax (Optional[plt.Axes], default None): The axes to use to show the plot. If None, a new figure will be created.
    device (str, default 'cpu'): The device on which to run the model.

Source code in ufcpredictor/plot_tools.py, lines 178-223
@staticmethod
def show_fight_prediction_detail_from_dataset(
    model: nn.Module,
    dataset: BasicDataset,
    fight_ids: Optional[List[str]] = None,
    print_info: bool = False,
    show_plot: bool = False,
    ax: Optional[plt.Axes] = None,
    device: str = "cpu",
) -> List[Tuple[float, int, float, float, bool, float, float, str]]:
    """
    Shows the prediction detail of a fight and the benefit of the model.

    It uses the dataset to get the data for the specified fight ids.
    It then calls show_fight_prediction_detail with the model and the data.

    Args:
        model : The model to use to make predictions.
        dataset : The dataset to use to get the data.
        fight_ids : The id of the fight to use. If None, it will use all the data
            in the dataset.
        print_info : If True, print the prediction, shift, odd1, odd2, and correct
            for each fight. If False, do not print anything.
        show_plot : If True, show a plot of the benefit of the model over time.
        ax : The axes to use to show the plot. If None, a new figure will be
            created.
    """
    X1, X2, X3, Y, odds1, odds2, fighter_names, opponent_names = (
        dataset.get_fight_data_from_ids(fight_ids)
    )

    stats = PredictionPlots.show_fight_prediction_detail(
        model,
        (X1, X2, X3, Y, odds1, odds2, fighter_names, opponent_names),
        print_info,
        show_plot,
        ax,
        device=device,
    )

    return [
        fight_stats + (fight_id,)
        for fight_stats, fight_id in zip(
            stats, dataset.fight_data["fight_id"].values
        )
    ]
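
A minimal usage sketch for this static method; trained_model and test_dataset are placeholders for an already trained nn.Module and a BasicDataset built elsewhere, not objects provided by the library:

    import matplotlib.pyplot as plt

    from ufcpredictor.plot_tools import PredictionPlots

    fig, ax = plt.subplots()

    stats = PredictionPlots.show_fight_prediction_detail_from_dataset(
        model=trained_model,   # placeholder: a trained nn.Module
        dataset=test_dataset,  # placeholder: a BasicDataset instance
        fight_ids=None,        # None evaluates every fight in the dataset
        print_info=True,
        show_plot=True,
        ax=ax,
        device="cpu",
    )

    # Each entry: (prediction, label, odd1, odd2, correct, bet, earning, fight_id)
    print(stats[0])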

Utils

Utility functions for the UFC predictor project.

This module contains various utility functions used throughout the project, including functions for converting between different time and odds formats, as well as other miscellaneous helper functions.

convert_minutes_to_seconds(time_str)

Convert a time string from minutes:seconds format to seconds.

Parameters:

    time_str (str): Time string in minutes:seconds format. Required.

Returns:

    Optional[int]: Time in seconds. If the input string is "--", returns 0. If the input is None or "NULL", or if the input is NaN, returns None.

Source code in ufcpredictor/utils.py
def convert_minutes_to_seconds(time_str: str) -> Optional[int]:
    """
    Convert a time string from minutes:seconds format to seconds.

    Args:
        time_str: Time string in minutes:seconds format.

    Returns:
        Time in seconds. If the input string is "--", returns 0. If the input is None
            or "NULL", or if the input is NaN, returns None.
    """
    if time_str == "--":
        return 0
    elif time_str in (None, "NULL") or pd.isna(time_str):
        return None
    else:
        minutes, seconds = map(int, time_str.split(":"))
        return minutes * 60 + seconds
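
A few example calls illustrating the documented behaviour (the values follow directly from the code above):

    convert_minutes_to_seconds("4:35")    # 275  (4 * 60 + 35)
    convert_minutes_to_seconds("--")      # 0
    convert_minutes_to_seconds("NULL")    # None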

convert_odds_to_decimal(odds)

Convert odds from American format to decimal format.

Parameters:

    odds (List[int | float] | NDArray[float64 | int_]): Odds in American format. Required.

Returns:

    NDArray[float64]: Odds in decimal format.

Source code in ufcpredictor/utils.py
def convert_odds_to_decimal(
    odds: List[int | float] | NDArray[np.float64 | np.int_],
) -> NDArray[np.float64]:
    """
    Convert odds from American format to decimal format.

    Args:
        odds: Odds in American format.

    Returns:
        Odds in decimal format.
    """
    if not isinstance(odds, np.ndarray):
        odds = np.asarray(odds, dtype=np.float64)
    else:
        odds = odds.astype(np.float64)

    msk = odds > 0

    odds[msk] = odds[msk] / 100 + 1
    odds[~msk] = 100 / -odds[~msk] + 1

    return odds
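
For instance, American odds of +150 and -200 map to decimal odds of 2.5 and 1.5 respectively (values follow directly from the code above):

    convert_odds_to_decimal([150, -200])   # array([2.5, 1.5])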

convert_odds_to_moneyline(odds)

Convert odds from decimal format to moneyline format.

Parameters:

    odds (NDArray[float64] | List[float]): Odds in decimal format. Required.

Returns:

    NDArray[int_]: Odds in moneyline format.

Source code in ufcpredictor/utils.py
def convert_odds_to_moneyline(
    odds: NDArray[np.float64] | List[float],
) -> NDArray[np.int_]:
    """
    Convert odds from decimal format to moneyline format.

    Args:
        odds: Odds in decimal format.

    Returns:
        Odds in moneyline format.
    """
    if not isinstance(odds, np.ndarray):
        odds = np.asarray(odds, dtype=np.float64)

    msk = odds > 2

    odds[msk] = (odds[msk] - 1) * 100
    odds[~msk] = 100 / (1 - odds[~msk])

    return np.round(odds).astype(int)
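
This is the inverse of convert_odds_to_decimal, so the earlier example round-trips (again derived directly from the code above):

    convert_odds_to_moneyline([2.5, 1.5])   # array([ 150, -200])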