
VLLMModelInterface

VLLMInferenceModel

Bases: BaseInferenceModel

VLLM inference model interface. This class extends the BaseInferenceModel to provide specific functionality for VLLM.

Source code in easyroutine/inference/vllm_model_interface.py
class VLLMInferenceModel(BaseInferenceModel):
    """
    VLLM inference model interface.
    This class extends the BaseInferenceModel to provide specific functionality for VLLM.
    """

    def __init__(self, config: BaseInferenceModelConfig):
        super().__init__(config)
        self.model = LLM(model=config.model_name, tensor_parallel_size=config.n_gpus, dtype=config.dtype)

    def convert_chat_messages_to_custom_format(self, chat_messages: List[dict[str, str]]) -> List[dict[str, str]]:
        """
        For now, VLLM is compatible with the chat template format we use.
        """
        return chat_messages

    def chat(self, chat_messages: List[dict[str, str]], use_tqdm=False, **kwargs) -> list:
        """
        Generate a response based on the provided chat messages.

        Arguments:
            chat_messages (List[dict[str, str]]): List of chat messages to process.
            use_tqdm (bool): Whether to display a progress bar during generation.
            **kwargs: Additional parameters for the model.

        Returns:
            list: The generated responses from the model.
        """
        chat_messages = self.convert_chat_messages_to_custom_format(chat_messages)

        # Build sampling parameters from the model config
        sampling_params = SamplingParams(
            temperature=self.config.temperature,
            top_p=self.config.top_p,
            max_tokens=self.config.max_new_tokens
        )

        # Generate response using VLLM
        response = self.model.chat(chat_messages, sampling_params=sampling_params, use_tqdm=use_tqdm) # type: ignore

        return response
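
A minimal usage sketch follows, assuming that BaseInferenceModelConfig exposes the fields the class reads above (model_name, n_gpus, dtype, temperature, top_p, max_new_tokens); the exact constructor signature may differ in your version of easyroutine, and the field values are purely illustrative.

from easyroutine.inference.vllm_model_interface import VLLMInferenceModel, VLLMInferenceModelConfig

# Hypothetical configuration; only the attribute names used by the class above are assumed.
config = VLLMInferenceModelConfig(
    model_name="meta-llama/Llama-3.1-8B-Instruct",
    n_gpus=1,
    dtype="bfloat16",
    temperature=0.7,
    top_p=0.9,
    max_new_tokens=256,
)

model = VLLMInferenceModel(config)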

chat(chat_messages, use_tqdm=False, **kwargs)

Generate a response based on the provided chat messages.

Parameters:

    chat_messages (List[dict[str, str]]): List of chat messages to process. Required.
    use_tqdm (bool): Whether to display a progress bar during generation. Default: False.
    **kwargs: Additional parameters for the model. Default: {}.

Returns:

    list: The generated responses from the model.

Source code in easyroutine/inference/vllm_model_interface.py
def chat(self, chat_messages: List[dict[str, str]], use_tqdm=False, **kwargs) -> list:
    """
    Generate a response based on the provided chat messages.

    Arguments:
        chat_messages (List[dict[str, str]]): List of chat messages to process.
        use_tqdm (bool): Whether to display a progress bar during generation.
        **kwargs: Additional parameters for the model.

    Returns:
        list: The generated responses from the model.
    """
    chat_messages = self.convert_chat_messages_to_custom_format(chat_messages)

    # Build sampling parameters from the model config
    sampling_params = SamplingParams(
        temperature=self.config.temperature,
        top_p=self.config.top_p,
        max_tokens=self.config.max_new_tokens
    )

    # Generate response using VLLM
    response = self.model.chat(chat_messages, sampling_params=sampling_params, use_tqdm=use_tqdm) # type: ignore

    return response
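
Continuing the sketch above, the call returns whatever vLLM's LLM.chat produces: a list of RequestOutput objects, one per conversation, each carrying its generated completions. The message contents below are illustrative.

chat_messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Summarize what vLLM does in one sentence."},
]

responses = model.chat(chat_messages, use_tqdm=True)

# Each RequestOutput holds one or more CompletionOutput objects; print the first completion's text.
for request_output in responses:
    print(request_output.outputs[0].text)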

convert_chat_messages_to_custom_format(chat_messages)

For now, VLLM is compatible with the chat template format we use.

Source code in easyroutine/inference/vllm_model_interface.py
def convert_chat_messages_to_custom_format(self, chat_messages: List[dict[str, str]]) -> List[dict[str, str]]:
    """
    For now, VLLM is compatible with the chat template format we use.
    """
    return chat_messages
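
Because this method is a pass-through, the role/content message dicts given to chat come back unchanged, for example:

messages = [{"role": "user", "content": "Hello!"}]
assert model.convert_chat_messages_to_custom_format(messages) == messages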

VLLMInferenceModelConfig dataclass

Bases: BaseInferenceModelConfig

Just a placeholder for now, as we don't have any specific config for VLLM.

Source code in easyroutine/inference/vllm_model_interface.py
@dataclass
class VLLMInferenceModelConfig(BaseInferenceModelConfig):
    """Just a placeholder for now, as we don't have any specific config for VLLM."""