author    Google APIs <noreply@google.com>  2019-05-13 14:10:00 -0700
committer Copybara-Service <copybara-worker@google.com>  2019-05-13 14:10:25 -0700
commit    bb798133097a12dd7a6deed4092b096dfc1cd316 (patch)
tree      e35ec5fc96b679cd39dac4de953d75a00988eb40 /google
parent    aa2542389436aabe91ed26e2390039f81af32d00 (diff)
download  googleapis-bb798133097a12dd7a6deed4092b096dfc1cd316.tar.gz
Synchronize new proto/yaml changes.
PiperOrigin-RevId: 248006867
Diffstat (limited to 'google')
-rw-r--r--  google/cloud/speech/v1/cloud_speech.proto | 142
1 file changed, 140 insertions(+), 2 deletions(-)
diff --git a/google/cloud/speech/v1/cloud_speech.proto b/google/cloud/speech/v1/cloud_speech.proto
index 11bd981b8..0966c11c3 100644
--- a/google/cloud/speech/v1/cloud_speech.proto
+++ b/google/cloud/speech/v1/cloud_speech.proto
@@ -19,9 +19,7 @@ package google.cloud.speech.v1;
import "google/api/annotations.proto";
import "google/longrunning/operations.proto";
-import "google/protobuf/any.proto";
import "google/protobuf/duration.proto";
-import "google/protobuf/empty.proto";
import "google/protobuf/timestamp.proto";
import "google/rpc/status.proto";
@@ -278,6 +276,9 @@ message RecognitionConfig {
// premium feature.
bool enable_automatic_punctuation = 11;
+ // *Optional* Metadata regarding this request.
+ RecognitionMetadata metadata = 9;
+
// *Optional* Which model to select for the given request. Select the model
// best suited to your domain to get the best results. If a model is not
// explicitly specified, then we auto-select a model based on the parameters
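
The new `metadata` field takes field number 9, lower than the neighboring `enable_automatic_punctuation = 11`; proto3 keys the wire format on field numbers, not declaration order, so placing it here is harmless. A minimal sketch of attaching it to a config, assuming Python stubs generated from this file with protoc (the `cloud_speech_pb2` module name is just protoc's default output, not anything this change defines):

    # Generated with: protoc --python_out=. google/cloud/speech/v1/cloud_speech.proto
    from google.cloud.speech.v1 import cloud_speech_pb2 as speech

    config = speech.RecognitionConfig(
        enable_automatic_punctuation=True,
        use_enhanced=True,
        # New in this change: request-level metadata (field 9).
        metadata=speech.RecognitionMetadata(
            interaction_type=speech.RecognitionMetadata.PHONE_CALL,
        ),
    )
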
@@ -330,6 +331,133 @@ message RecognitionConfig {
bool use_enhanced = 14;
}
+// Description of audio data to be recognized.
+message RecognitionMetadata {
+ // Use case categories that describe the audio recognition request.
+ enum InteractionType {
+ // Use case is either unknown or is something other than one of the other
+ // values below.
+ INTERACTION_TYPE_UNSPECIFIED = 0;
+
+ // Multiple people in a conversation or discussion. For example, in a
+ // meeting with two or more people actively participating. Typically,
+ // all the primary speakers would be in the same room (if not,
+ // see PHONE_CALL).
+ DISCUSSION = 1;
+
+ // One or more persons lecturing or presenting to others, mostly
+ // uninterrupted.
+ PRESENTATION = 2;
+
+ // A phone call or video conference in which two or more people, who are
+ // not in the same room, are actively participating.
+ PHONE_CALL = 3;
+
+ // A recorded message intended for another person to listen to.
+ VOICEMAIL = 4;
+
+ // Professionally produced audio (e.g. a TV show or podcast).
+ PROFESSIONALLY_PRODUCED = 5;
+
+ // Transcribe spoken questions and queries into text.
+ VOICE_SEARCH = 6;
+
+ // Transcribe voice commands, such as for controlling a device.
+ VOICE_COMMAND = 7;
+
+ // Transcribe speech to text to create a written document, such as a
+ // text message, email, or report.
+ DICTATION = 8;
+ }
+
+ // The use case most closely describing the audio content to be recognized.
+ InteractionType interaction_type = 1;
+
+ // The industry vertical to which this speech recognition request most
+ // closely applies. This is most indicative of the topics contained
+ // in the audio. Use the 6-digit NAICS code to identify the industry
+ // vertical; see https://www.naics.com/search/.
+ uint32 industry_naics_code_of_audio = 3;
+
+ // Enumerates the types of capture settings describing an audio file.
+ enum MicrophoneDistance {
+ // Audio type is not known.
+ MICROPHONE_DISTANCE_UNSPECIFIED = 0;
+
+ // The audio was captured from a closely placed microphone, e.g. a phone,
+ // dictaphone, or handheld microphone. Generally, the speaker is within
+ // 1 meter of the microphone.
+ NEARFIELD = 1;
+
+ // The speaker is within 3 meters of the microphone.
+ MIDFIELD = 2;
+
+ // The speaker is more than 3 meters away from the microphone.
+ FARFIELD = 3;
+ }
+
+ // The audio type that most closely describes the audio being recognized.
+ MicrophoneDistance microphone_distance = 4;
+
+ // The original media the speech was recorded on.
+ enum OriginalMediaType {
+ // Unknown original media type.
+ ORIGINAL_MEDIA_TYPE_UNSPECIFIED = 0;
+
+ // The speech data is an audio recording.
+ AUDIO = 1;
+
+ // The speech data was originally recorded on a video.
+ VIDEO = 2;
+ }
+
+ // The original media the speech was recorded on.
+ OriginalMediaType original_media_type = 5;
+
+ // The type of device the speech was recorded with.
+ enum RecordingDeviceType {
+ // The recording device is unknown.
+ RECORDING_DEVICE_TYPE_UNSPECIFIED = 0;
+
+ // Speech was recorded on a smartphone.
+ SMARTPHONE = 1;
+
+ // Speech was recorded using a personal computer or tablet.
+ PC = 2;
+
+ // Speech was recorded over a phone line.
+ PHONE_LINE = 3;
+
+ // Speech was recorded in a vehicle.
+ VEHICLE = 4;
+
+ // Speech was recorded outdoors.
+ OTHER_OUTDOOR_DEVICE = 5;
+
+ // Speech was recorded indoors.
+ OTHER_INDOOR_DEVICE = 6;
+ }
+
+ // The type of device the speech was recorded with.
+ RecordingDeviceType recording_device_type = 6;
+
+ // The device used to make the recording. Examples: 'Nexus 5X',
+ // 'Polycom SoundStation IP 6000', 'POTS', 'VoIP', or
+ // 'Cardioid Microphone'.
+ string recording_device_name = 7;
+
+ // MIME type of the original audio file. For example `audio/m4a`,
+ // `audio/x-alaw-basic`, `audio/mp3`, `audio/3gpp`.
+ // A list of possible audio MIME types is maintained at
+ // http://www.iana.org/assignments/media-types/media-types.xhtml#audio
+ string original_mime_type = 8;
+
+ // Description of the content, e.g. "Recordings of federal supreme court
+ // hearings from 2012".
+ string audio_topic = 10;
+}
+
// Provides "hints" to the speech recognizer to favor specific words and phrases
// in the results.
message SpeechContext {
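
Taken together, `RecognitionMetadata` lets a caller describe the recording conditions up front. A sketch populating every field of the new message, using the same hypothetical generated module as above; all values are illustrative, including the NAICS code:

    from google.cloud.speech.v1 import cloud_speech_pb2 as speech

    metadata = speech.RecognitionMetadata(
        interaction_type=speech.RecognitionMetadata.PHONE_CALL,
        industry_naics_code_of_audio=522320,   # illustrative 6-digit NAICS code
        microphone_distance=speech.RecognitionMetadata.NEARFIELD,
        original_media_type=speech.RecognitionMetadata.AUDIO,
        recording_device_type=speech.RecognitionMetadata.PHONE_LINE,
        recording_device_name="Polycom SoundStation IP 6000",
        original_mime_type="audio/x-alaw-basic",
        audio_topic="Customer calls about card payment disputes",
    )
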
@@ -504,10 +632,20 @@ message StreamingRecognitionResult {
// The default of 0.0 is a sentinel value indicating `stability` was not set.
float stability = 3;
+ // Output only. Time offset of the end of this result relative to the
+ // beginning of the audio.
+ google.protobuf.Duration result_end_time = 4;
+
// For multi-channel audio, this is the channel number corresponding to the
// recognized result for the audio from that channel.
// For audio_channel_count = N, its output values can range from '1' to 'N'.
int32 channel_tag = 5;
+
+ // Output only. The
+ // [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag of the
+ // language in this result. This language code was detected as the most
+ // likely language spoken in the audio.
+ string language_code = 6;
}
// A speech recognition result corresponding to a portion of the audio.
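
On the read side, the two fields added to `StreamingRecognitionResult` sit next to the existing `stability` and `channel_tag`. A hedged consumption sketch; the `responses` iterable stands in for the return value of a streaming-recognize call, which is outside this diff:

    # `responses` is assumed to be an iterable of StreamingRecognizeResponse.
    for response in responses:
        for result in response.results:
            # New in this change: end-of-result offset and detected language.
            t = result.result_end_time  # a google.protobuf.Duration
            seconds = t.seconds + t.nanos / 1e9
            print("[%.2fs] lang=%s channel=%d stability=%.2f"
                  % (seconds, result.language_code, result.channel_tag,
                     result.stability))
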