diff --git a/assemblyai/__version__.py b/assemblyai/__version__.py index 647040d..c60bb1b 100644 --- a/assemblyai/__version__.py +++ b/assemblyai/__version__.py @@ -1 +1 @@ -__version__ = "0.62.0" +__version__ = "0.63.0" diff --git a/assemblyai/transcriber.py b/assemblyai/transcriber.py index 9a319d4..ef94bac 100644 --- a/assemblyai/transcriber.py +++ b/assemblyai/transcriber.py @@ -491,6 +491,30 @@ def utterances(self) -> Optional[List[types.Utterance]]: return self._impl.transcript.utterances + @property + def unredacted_text(self) -> Optional[str]: + "The unredacted transcript text, when `redact_pii_return_unredacted` was enabled." + if not self._impl.transcript: + raise ValueError("The internal Transcript object is None.") + + return self._impl.transcript.unredacted_text + + @property + def unredacted_words(self) -> Optional[List[types.Word]]: + "The unredacted list of words, when `redact_pii_return_unredacted` was enabled." + if not self._impl.transcript: + raise ValueError("The internal Transcript object is None.") + + return self._impl.transcript.unredacted_words + + @property + def unredacted_utterances(self) -> Optional[List[types.Utterance]]: + "The unredacted list of utterances, when `redact_pii_return_unredacted` was enabled." + if not self._impl.transcript: + raise ValueError("The internal Transcript object is None.") + + return self._impl.transcript.unredacted_utterances + @property def confidence(self) -> Optional[float]: "The confidence our model has in the transcribed text, between 0 and 1" diff --git a/assemblyai/types.py b/assemblyai/types.py index 9d6201c..f39beff 100644 --- a/assemblyai/types.py +++ b/assemblyai/types.py @@ -876,6 +876,8 @@ class RawTranscriptionConfig(BaseModel): "The list of PII Redaction policies to enable." redact_pii_sub: Optional[PIISubstitutionPolicy] = None "The replacement logic for detected PII." + redact_pii_return_unredacted: Optional[bool] = None + "If `redact_pii` is enabled, also return the unredacted text/words/utterances alongside the redacted fields." speaker_labels: Optional[bool] = None "Enable Speaker Diarization." @@ -1008,6 +1010,7 @@ def __init__( redact_pii_audio_options: Optional[RedactPiiAudioOptions] = None, redact_pii_policies: Optional[List[PIIRedactionPolicy]] = None, redact_pii_sub: Optional[PIISubstitutionPolicy] = None, + redact_pii_return_unredacted: Optional[bool] = None, speaker_labels: Optional[bool] = None, speakers_expected: Optional[int] = None, speaker_options: Optional[SpeakerOptions] = None, @@ -1060,6 +1063,7 @@ def __init__( redact_pii_audio_options: Options for controlling PII audio redaction behavior (e.g., override the redaction method to silence). redact_pii_policies: The list of PII Redaction policies to enable. redact_pii_sub: The replacement logic for detected PII. + redact_pii_return_unredacted: If `redact_pii` is enabled, also return the unredacted text/words/utterances on the transcript response. Requires `redact_pii=True`. speaker_labels: Enable Speaker Diarization. speakers_expected: The number of speakers you expect to hear in your audio file. Up to 10 speakers are supported. speaker_options: Advanced options for controlling speaker diarization parameters, including min and max speakers expected. @@ -1117,6 +1121,7 @@ def __init__( redact_pii_audio_options, redact_pii_policies, redact_pii_sub, + redact_pii_return_unredacted, ) self.set_speaker_diarization(speaker_labels, speakers_expected, speaker_options) self.set_content_safety(content_safety, content_safety_confidence) @@ -1397,6 +1402,12 @@ def redact_pii_sub(self) -> Optional[PIISubstitutionPolicy]: return self._raw_transcription_config.redact_pii_sub + @property + def redact_pii_return_unredacted(self) -> Optional[bool]: + "Returns whether the unredacted text/words/utterances should also be returned alongside redacted fields." + + return self._raw_transcription_config.redact_pii_return_unredacted + @property def speaker_labels(self) -> Optional[bool]: "Returns the status of the Speaker Diarization feature." @@ -1797,6 +1808,7 @@ def set_redact_pii( redact_audio_options: Optional[RedactPiiAudioOptions] = None, policies: Optional[List[PIIRedactionPolicy]] = None, substitution: Optional[PIISubstitutionPolicy] = None, + return_unredacted: Optional[bool] = None, ) -> Self: """ Enables Personal Identifiable Information (PII) Redaction feature. @@ -1808,6 +1820,7 @@ def set_redact_pii( redact_audio_options: Options for controlling PII audio redaction behavior (e.g., override the redaction method to silence). policies: A list of PII redaction policies to enable. substitution: The replacement logic for detected PII (`PIISubstutionPolicy.hash` by default). + return_unredacted: Also return the unredacted text/words/utterances on the transcript response. Only valid when redaction is enabled. """ if not enable: @@ -1817,6 +1830,7 @@ def set_redact_pii( self._raw_transcription_config.redact_pii_audio_options = None self._raw_transcription_config.redact_pii_policies = None self._raw_transcription_config.redact_pii_sub = None + self._raw_transcription_config.redact_pii_return_unredacted = None return self @@ -1829,6 +1843,7 @@ def set_redact_pii( self._raw_transcription_config.redact_pii_audio_options = redact_audio_options self._raw_transcription_config.redact_pii_policies = policies self._raw_transcription_config.redact_pii_sub = substitution + self._raw_transcription_config.redact_pii_return_unredacted = return_unredacted return self @@ -2280,6 +2295,8 @@ class BaseTranscript(BaseModel): "The list of PII Redaction policies to enable." redact_pii_sub: Optional[PIISubstitutionPolicy] = None "The replacement logic for detected PII." + redact_pii_return_unredacted: Optional[bool] = None + "If `redact_pii` is enabled, also return the unredacted text/words/utterances alongside the redacted fields." speaker_labels: Optional[bool] = None "Enable Speaker Diarization." @@ -2424,6 +2441,15 @@ class TranscriptResponse(BaseTranscript): utterances: Optional[List[Utterance]] = None "When `dual_channel`, `multichannel`, or `speaker_labels` is enabled, a list of turn-by-turn utterances" + unredacted_text: Optional[str] = None + "The unredacted transcript text. Returned only when `redact_pii_return_unredacted` was set with `redact_pii`." + + unredacted_words: Optional[List[Word]] = None + "The unredacted list of individual words. Returned only when `redact_pii_return_unredacted` was set with `redact_pii`." + + unredacted_utterances: Optional[List[Utterance]] = None + "The unredacted list of utterances. Returned only when `redact_pii_return_unredacted` was set with `redact_pii` and channel/speaker modes are enabled." + confidence: Optional[float] = None "The confidence our model has in the transcribed text, between 0.0 and 1.0" diff --git a/tests/unit/test_redact_pii.py b/tests/unit/test_redact_pii.py index 8a7bae7..5a73e2c 100644 --- a/tests/unit/test_redact_pii.py +++ b/tests/unit/test_redact_pii.py @@ -198,6 +198,110 @@ def test_redact_pii_params_excluded_when_disabled(httpx_mock: HTTPXMock): assert request_body.get("redact_pii_sub") is None +def test_redact_pii_return_unredacted_default_absent(httpx_mock: HTTPXMock): + """ + Tests that `redact_pii_return_unredacted` is absent from the request body + when not explicitly set, even when `redact_pii` is enabled. + """ + request_body, _ = unit_test_utils.submit_mock_transcription_request( + httpx_mock, + mock_response=factories.generate_dict_factory( + TranscriptWithPIIRedactionResponseFactory + )(), + config=aai.TranscriptionConfig( + redact_pii=True, + redact_pii_policies=[aai.types.PIIRedactionPolicy.date], + ), + ) + + assert request_body.get("redact_pii_return_unredacted") is None + + +def test_redact_pii_return_unredacted_in_request(httpx_mock: HTTPXMock): + """ + Tests that setting `return_unredacted=True` on `set_redact_pii` puts + `redact_pii_return_unredacted=True` in the submission request body. + """ + config = aai.TranscriptionConfig().set_redact_pii( + policies=[aai.types.PIIRedactionPolicy.date], + return_unredacted=True, + ) + + request_body, _ = unit_test_utils.submit_mock_transcription_request( + httpx_mock, + mock_response=factories.generate_dict_factory( + TranscriptWithPIIRedactionResponseFactory + )(), + config=config, + ) + + assert request_body.get("redact_pii") is True + assert request_body.get("redact_pii_return_unredacted") is True + + +def test_redact_pii_return_unredacted_cleared_on_disable(): + """ + Tests that calling `set_redact_pii(enable=False)` after enabling with + `return_unredacted=True` clears the flag from the raw config. + """ + config = aai.TranscriptionConfig().set_redact_pii( + policies=[aai.types.PIIRedactionPolicy.date], + return_unredacted=True, + ) + assert config.redact_pii_return_unredacted is True + + config.set_redact_pii(enable=False) + assert config.redact_pii_return_unredacted is None + + +def test_unredacted_response_fields_surface_on_transcript(httpx_mock: HTTPXMock): + """ + Tests that `unredacted_text`, `unredacted_words`, and `unredacted_utterances` + in the API response are accessible via the `Transcript` wrapper. + """ + base_response = factories.generate_dict_factory( + TranscriptWithPIIRedactionResponseFactory + )() + unredacted_words = [{"text": "hello", "start": 0, "end": 500, "confidence": 0.99}] + unredacted_utterances = [ + { + "text": "hello world", + "start": 0, + "end": 1000, + "confidence": 0.99, + "speaker": "A", + "words": [ + {"text": "hello", "start": 0, "end": 500, "confidence": 0.99}, + {"text": "world", "start": 500, "end": 1000, "confidence": 0.99}, + ], + } + ] + mock_response = { + **base_response, + "redact_pii_return_unredacted": True, + "unredacted_text": "hello world", + "unredacted_words": unredacted_words, + "unredacted_utterances": unredacted_utterances, + } + + _, transcript = unit_test_utils.submit_mock_transcription_request( + httpx_mock, + mock_response=mock_response, + config=aai.TranscriptionConfig().set_redact_pii( + policies=[aai.types.PIIRedactionPolicy.date], + return_unredacted=True, + ), + ) + + assert transcript.unredacted_text == "hello world" + assert transcript.unredacted_words is not None + assert len(transcript.unredacted_words) == 1 + assert transcript.unredacted_words[0].text == "hello" + assert transcript.unredacted_utterances is not None + assert len(transcript.unredacted_utterances) == 1 + assert transcript.unredacted_utterances[0].text == "hello world" + + def __get_redacted_audio_api_url(transcript: aai.Transcript) -> str: return ( f"{aai.settings.base_url}{ENDPOINT_TRANSCRIPT}/{transcript.id}/redacted-audio"