From bb798133097a12dd7a6deed4092b096dfc1cd316 Mon Sep 17 00:00:00 2001 From: Google APIs Date: Mon, 13 May 2019 14:10:00 -0700 Subject: Synchronize new proto/yaml changes. PiperOrigin-RevId: 248006867 --- google/cloud/speech/v1/cloud_speech.proto | 142 +++++++++++++++++++++++++++++- 1 file changed, 140 insertions(+), 2 deletions(-) (limited to 'google') diff --git a/google/cloud/speech/v1/cloud_speech.proto b/google/cloud/speech/v1/cloud_speech.proto index 11bd981b8..0966c11c3 100644 --- a/google/cloud/speech/v1/cloud_speech.proto +++ b/google/cloud/speech/v1/cloud_speech.proto @@ -19,9 +19,7 @@ package google.cloud.speech.v1; import "google/api/annotations.proto"; import "google/longrunning/operations.proto"; -import "google/protobuf/any.proto"; import "google/protobuf/duration.proto"; -import "google/protobuf/empty.proto"; import "google/protobuf/timestamp.proto"; import "google/rpc/status.proto"; @@ -278,6 +276,9 @@ message RecognitionConfig { // premium feature. bool enable_automatic_punctuation = 11; + // *Optional* Metadata regarding this request. + RecognitionMetadata metadata = 9; + // *Optional* Which model to select for the given request. Select the model // best suited to your domain to get best results. If a model is not // explicitly specified, then we auto-select a model based on the parameters @@ -330,6 +331,133 @@ message RecognitionConfig { bool use_enhanced = 14; } +// Description of audio data to be recognized. +message RecognitionMetadata { + // Use case categories that the audio recognition request can be described + // by. + enum InteractionType { + // Use case is either unknown or is something other than one of the other + // values below. + INTERACTION_TYPE_UNSPECIFIED = 0; + + // Multiple people in a conversation or discussion. For example in a + // meeting with two or more people actively participating. Typically + // all the primary people speaking would be in the same room (if not, + // see PHONE_CALL) + DISCUSSION = 1; + + // One or more persons lecturing or presenting to others, mostly + // uninterrupted. + PRESENTATION = 2; + + // A phone-call or video-conference in which two or more people, who are + // not in the same room, are actively participating. + PHONE_CALL = 3; + + // A recorded message intended for another person to listen to. + VOICEMAIL = 4; + + // Professionally produced audio (eg. TV Show, Podcast). + PROFESSIONALLY_PRODUCED = 5; + + // Transcribe spoken questions and queries into text. + VOICE_SEARCH = 6; + + // Transcribe voice commands, such as for controlling a device. + VOICE_COMMAND = 7; + + // Transcribe speech to text to create a written document, such as a + // text-message, email or report. + DICTATION = 8; + } + + // The use case most closely describing the audio content to be recognized. + InteractionType interaction_type = 1; + + // The industry vertical to which this speech recognition request most + // closely applies. This is most indicative of the topics contained + // in the audio. Use the 6-digit NAICS code to identify the industry + // vertical - see https://www.naics.com/search/. + uint32 industry_naics_code_of_audio = 3; + + // Enumerates the types of capture settings describing an audio file. + enum MicrophoneDistance { + // Audio type is not known. + MICROPHONE_DISTANCE_UNSPECIFIED = 0; + + // The audio was captured from a closely placed microphone. Eg. phone, + // dictaphone, or handheld microphone. Generally if there speaker is within + // 1 meter of the microphone. + NEARFIELD = 1; + + // The speaker if within 3 meters of the microphone. + MIDFIELD = 2; + + // The speaker is more than 3 meters away from the microphone. + FARFIELD = 3; + } + + // The audio type that most closely describes the audio being recognized. + MicrophoneDistance microphone_distance = 4; + + // The original media the speech was recorded on. + enum OriginalMediaType { + // Unknown original media type. + ORIGINAL_MEDIA_TYPE_UNSPECIFIED = 0; + + // The speech data is an audio recording. + AUDIO = 1; + + // The speech data originally recorded on a video. + VIDEO = 2; + } + + // The original media the speech was recorded on. + OriginalMediaType original_media_type = 5; + + // The type of device the speech was recorded with. + enum RecordingDeviceType { + // The recording device is unknown. + RECORDING_DEVICE_TYPE_UNSPECIFIED = 0; + + // Speech was recorded on a smartphone. + SMARTPHONE = 1; + + // Speech was recorded using a personal computer or tablet. + PC = 2; + + // Speech was recorded over a phone line. + PHONE_LINE = 3; + + // Speech was recorded in a vehicle. + VEHICLE = 4; + + // Speech was recorded outdoors. + OTHER_OUTDOOR_DEVICE = 5; + + // Speech was recorded indoors. + OTHER_INDOOR_DEVICE = 6; + } + + // The type of device the speech was recorded with. + RecordingDeviceType recording_device_type = 6; + + // The device used to make the recording. Examples 'Nexus 5X' or + // 'Polycom SoundStation IP 6000' or 'POTS' or 'VoIP' or + // 'Cardioid Microphone'. + string recording_device_name = 7; + + // Mime type of the original audio file. For example `audio/m4a`, + // `audio/x-alaw-basic`, `audio/mp3`, `audio/3gpp`. + // A list of possible audio mime types is maintained at + // http://www.iana.org/assignments/media-types/media-types.xhtml#audio + string original_mime_type = 8; + + // Description of the content. Eg. "Recordings of federal supreme court + // hearings from 2012". + string audio_topic = 10; +} + // Provides "hints" to the speech recognizer to favor specific words and phrases // in the results. message SpeechContext { @@ -504,10 +632,20 @@ message StreamingRecognitionResult { // The default of 0.0 is a sentinel value indicating `stability` was not set. float stability = 3; + // Output only. Time offset of the end of this result relative to the + // beginning of the audio. + google.protobuf.Duration result_end_time = 4; + // For multi-channel audio, this is the channel number corresponding to the // recognized result for the audio from that channel. // For audio_channel_count = N, its output values can range from '1' to 'N'. int32 channel_tag = 5; + + // Output only. The + // [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag of the + // language in this result. This language code was detected to have the most + // likelihood of being spoken in the audio. + string language_code = 6; } // A speech recognition result corresponding to a portion of the audio. -- cgit v1.2.3