> ## Documentation Index
> Fetch the complete documentation index at: https://docs.obiguard.ai/llms.txt
> Use this file to discover all available pages before exploring further.

# Create Transcription


## OpenAPI

````yaml post /audio/transcriptions
openapi: 3.0.0
info:
  title: Obiguard API
  description: >-
    The Obiguard REST API. Please see
    https://docs.obiguard.ai/docs/api-reference for more details.
  version: 1.0.0
  termsOfService: https://obiguard.ai/terms
servers:
  - url: https://gateway.obiguard.ai/v1
security: []
tags:
  - name: Audio
    description: Turn audio into text or text into audio.
  - name: Chat
    description: >-
      Given a list of messages comprising a conversation, the model will return
      a response.
  - name: Completions
    description: >-
      Given a prompt, the model will return one or more predicted completions,
      and can also return the probabilities of alternative tokens at each
      position.
  - name: Embeddings
    description: >-
      Get a vector representation of a given input that can be easily consumed
      by machine learning models and algorithms.
  - name: Fine-tuning
    description: Manage fine-tuning jobs to tailor a model to your specific training data.
  - name: Files
    description: >-
      Files are used to upload documents that can be used with features like
      Assistants and Fine-tuning.
  - name: Images
    description: Given a prompt and/or an input image, the model will generate a new image.
paths:
  /audio/transcriptions:
    post:
      tags:
        - Audio
      summary: Create Transcription
      operationId: createTranscription
      requestBody:
        required: true
        content:
          multipart/form-data:
            schema:
              $ref: '#/components/schemas/CreateTranscriptionRequest'
      responses:
        '200':
          description: OK
          content:
            application/json:
              schema:
                oneOf:
                  - $ref: '#/components/schemas/CreateTranscriptionResponseJson'
                  - $ref: >-
                      #/components/schemas/CreateTranscriptionResponseVerboseJson
      security:
        - Obiguard-API-Key: []
      servers:
        - url: https://gateway.obiguard.ai/v1
      x-code-samples:
        - lang: curl
          label: Default
          source: |
            curl https://gateway.obiguard.ai/v1/audio/transcriptions \
              -H "x-obiguard-api-key: $OBIGUARD_API_KEY" \
              -H "Content-Type: multipart/form-data" \
              -F file="@/path/to/file/audio.mp3" \
              -F model="whisper-1"
        - lang: python
          label: Default
          source: |
            from obiguard import Obiguard

            client = Obiguard(
              obiguard_api_key="vk-obg***",  # Your Obiguard virtual key
            )

            audio_file = open("speech.mp3", "rb")
            transcript = client.audio.transcriptions.create(
              model="whisper-1",
              file=audio_file
            )
        - lang: curl
          label: Self-Hosted
          source: |
            curl -X POST "gateway.obiguard.ai/v1/audio/transcriptions" \
              -H "Content-Type: application/json" \
              -H "x-obiguard-api-key: $OBIGUARD_API_KEY" \
              -d '{
                "model": "whisper-1",
                "file": "@/path/to/file/audio.mp3"
              }' \
              --output transcription.json
components:
  schemas:
    CreateTranscriptionRequest:
      type: object
      additionalProperties: false
      properties:
        file:
          description: >
            The audio file object (not file name) to transcribe, in one of these
            formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
          type: string
          x-oaiTypeLabel: file
          format: binary
        model:
          description: >
            ID of the model to use. Only `whisper-1` (which is powered by our
            open source Whisper V2 model) is currently available.
          example: whisper-1
          anyOf:
            - type: string
            - type: string
              enum:
                - whisper-1
          x-oaiTypeLabel: string
        language:
          description: >
            The language of the input audio. Supplying the input language in
            [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes)
            format will improve accuracy and latency.
          type: string
        prompt:
          description: >
            An optional text to guide the model's style or continue a previous
            audio segment. The
            [prompt](https://platform.openai.com/docs/guides/speech-to-text/prompting)
            should match the audio language.
          type: string
        response_format:
          description: >
            The format of the transcript output, in one of these options:
            `json`, `text`, `srt`, `verbose_json`, or `vtt`.
          type: string
          enum:
            - json
            - text
            - srt
            - verbose_json
            - vtt
          default: json
        temperature:
          description: >
            The sampling temperature, between 0 and 1. Higher values like 0.8
            will make the output more random, while lower values like 0.2 will
            make it more focused and deterministic. If set to 0, the model will
            use [log probability](https://en.wikipedia.org/wiki/Log_probability)
            to automatically increase the temperature until certain thresholds
            are hit.
          type: number
          default: 0
        timestamp_granularities[]:
          description: >
            The timestamp granularities to populate for this transcription.
            `response_format` must be set `verbose_json` to use timestamp
            granularities. Either or both of these options are supported:
            `word`, or `segment`. Note: There is no additional latency for
            segment timestamps, but generating word timestamps incurs additional
            latency.
          type: array
          items:
            type: string
            enum:
              - word
              - segment
          default:
            - segment
      required:
        - file
        - model
    CreateTranscriptionResponseJson:
      type: object
      description: >-
        Represents a transcription response returned by model, based on the
        provided input.
      properties:
        text:
          type: string
          description: The transcribed text.
      required:
        - text
    CreateTranscriptionResponseVerboseJson:
      type: object
      description: >-
        Represents a verbose json transcription response returned by model,
        based on the provided input.
      properties:
        language:
          type: string
          description: The language of the input audio.
        duration:
          type: string
          description: The duration of the input audio.
        text:
          type: string
          description: The transcribed text.
        words:
          type: array
          description: Extracted words and their corresponding timestamps.
          items:
            $ref: '#/components/schemas/TranscriptionWord'
        segments:
          type: array
          description: Segments of the transcribed text and their corresponding details.
          items:
            $ref: '#/components/schemas/TranscriptionSegment'
      required:
        - language
        - duration
        - text
    TranscriptionWord:
      type: object
      properties:
        word:
          type: string
          description: The text content of the word.
        start:
          type: number
          format: float
          description: Start time of the word in seconds.
        end:
          type: number
          format: float
          description: End time of the word in seconds.
      required:
        - word
        - start
        - end
    TranscriptionSegment:
      type: object
      properties:
        id:
          type: integer
          description: Unique identifier of the segment.
        seek:
          type: integer
          description: Seek offset of the segment.
        start:
          type: number
          format: float
          description: Start time of the segment in seconds.
        end:
          type: number
          format: float
          description: End time of the segment in seconds.
        text:
          type: string
          description: Text content of the segment.
        tokens:
          type: array
          items:
            type: integer
          description: Array of token IDs for the text content.
        temperature:
          type: number
          format: float
          description: Temperature parameter used for generating the segment.
        avg_logprob:
          type: number
          format: float
          description: >-
            Average logprob of the segment. If the value is lower than -1,
            consider the logprobs failed.
        compression_ratio:
          type: number
          format: float
          description: >-
            Compression ratio of the segment. If the value is greater than 2.4,
            consider the compression failed.
        no_speech_prob:
          type: number
          format: float
          description: >-
            Probability of no speech in the segment. If the value is higher than
            1.0 and the `avg_logprob` is below -1, consider this segment silent.
      required:
        - id
        - seek
        - start
        - end
        - text
        - tokens
        - temperature
        - avg_logprob
        - compression_ratio
        - no_speech_prob
  securitySchemes:
    Obiguard-API-Key:
      type: apiKey
      in: header
      name: x-obiguard-api-key

````