Blog>
Snippets

Tokenizing Input Text for RAG using TensorFlow and Hugging Face

This snippet shows how to tokenize input text in the browser before sending it to a Retrieval-Augmented Generation (RAG) model, using TensorFlow.js and the tokenizers library from Hugging Face.
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Tokenize Input Text for RAG</title>
    <!--
        TensorFlow.js packages loaded from the jsDelivr CDN: the main tfjs
        bundle, the question-and-answer model package, and the converter.
        The URLs carry no version specifier, so the release that gets served
        is not fixed by this page.
        NOTE(review): the inline script in this page only references the
        `tokenizers` global and the DOM; it does not use any of these three
        scripts. Confirm whether they are actually needed here.
    -->
    <script src="https://cdn.jsdelivr.net/npm/@tensorflow/tfjs"></script>
    <script src="https://cdn.jsdelivr.net/npm/@tensorflow-models/qna"></script>
    <script src="https://cdn.jsdelivr.net/npm/@tensorflow/tfjs-converter"></script>
</head>
<body>

    <!-- A visible label gives the textarea an accessible name; the placeholder
         alone disappears on input and is not a substitute for a label. -->
    <label for="input-text">Text to tokenize</label>
    <textarea id="input-text" placeholder="Enter text to tokenize..."></textarea>
    <!-- Explicit type="button" so the control never acts as a submit button
         if it is later placed inside a form. -->
    <button type="button" onclick="tokenizeText()">Tokenize Text</button>

    <script src="https://cdn.jsdelivr.net/npm/@huggingface/tokenizers"></script>
    <script>
        // Load the tokenizer
        /**
         * Loads the Hugging Face tokenizer, starting the load at most once.
         *
         * The load promise is cached on the function object so that repeated
         * calls (for example one per button click) reuse the same tokenizer
         * instead of loading it again every time. If the load fails, the cache
         * is cleared so that a later call can retry.
         *
         * NOTE(review): relies on a global `tokenizers` object exposing
         * `Tokenizer.fromPretrained`, presumably provided by the tokenizers
         * script loaded by this page. Confirm against that library's browser
         * build.
         *
         * @returns {Promise<object>} Resolves to whatever
         *     `tokenizers.Tokenizer.fromPretrained` yields for the model URL.
         */
        async function loadTokenizer() {
            if (!loadTokenizer.cachedPromise) {
                const modelUrl = 'MODEL_URL_HERE'; // Add the model url for the tokenizer
                // Promise.resolve() keeps this working whether fromPretrained
                // returns a promise or a plain value, as `await` did before.
                loadTokenizer.cachedPromise = Promise
                    .resolve(tokenizers.Tokenizer.fromPretrained(modelUrl))
                    .catch((error) => {
                        // Never cache a failure: let the next call try again.
                        loadTokenizer.cachedPromise = null;
                        throw error;
                    });
            }
            return loadTokenizer.cachedPromise;
        }

        // Tokenize input text when button is clicked
        /**
         * Click handler for the "Tokenize Text" button.
         *
         * Obtains the tokenizer via loadTokenizer(), reads the current value of
         * the #input-text element, encodes it, and logs the result to the
         * browser console. Nothing is written back to the page.
         *
         * @returns {Promise<void>} Settles once the encoding has been logged.
         */
        async function tokenizeText() {
            const hfTokenizer = await loadTokenizer();
            // Read the text only after the tokenizer is ready, so the value
            // reflects what is in the box at encoding time.
            const { value: rawText } = document.getElementById('input-text');
            const encoded = await hfTokenizer.encode(rawText);
            console.log('Tokenized input:', encoded);
        }
    </script>

</body>
</html>
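This code is an HTML document that contains a text area for user input and a button that triggers tokenization. It loads four external scripts from a CDN: TensorFlow.js, the TensorFlow.js question-and-answer model package, the TensorFlow.js converter, and the tokenizers library from Hugging Face. When the user clicks the 'Tokenize Text' button, the tokenizeText function is called; it loads the tokenizer, reads the text from the text area, encodes it with the tokenizers library, and logs the resulting encoding to the browser console. Note that 'MODEL_URL_HERE' is a placeholder and must be replaced with the location of the actual tokenizer before the page will work.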