Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion assemblyai/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.62.0"
__version__ = "0.63.0"
24 changes: 24 additions & 0 deletions assemblyai/transcriber.py
Original file line number Diff line number Diff line change
Expand Up @@ -491,6 +491,30 @@ def utterances(self) -> Optional[List[types.Utterance]]:

return self._impl.transcript.utterances

@property
def unredacted_text(self) -> Optional[str]:
    """
    The unredacted transcript text, when `redact_pii_return_unredacted` was enabled.

    Raises:
        ValueError: If the internal transcript object is missing.
    """
    transcript = self._impl.transcript
    if transcript:
        return transcript.unredacted_text

    # Nothing to read from: the wrapped transcript has not been populated.
    raise ValueError("The internal Transcript object is None.")

@property
def unredacted_words(self) -> Optional[List[types.Word]]:
    """
    The unredacted list of words, when `redact_pii_return_unredacted` was enabled.

    Raises:
        ValueError: If the internal transcript object is missing.
    """
    transcript = self._impl.transcript
    if transcript:
        return transcript.unredacted_words

    # Nothing to read from: the wrapped transcript has not been populated.
    raise ValueError("The internal Transcript object is None.")

@property
def unredacted_utterances(self) -> Optional[List[types.Utterance]]:
    """
    The unredacted list of utterances, when `redact_pii_return_unredacted` was enabled.

    Raises:
        ValueError: If the internal transcript object is missing.
    """
    transcript = self._impl.transcript
    if transcript:
        return transcript.unredacted_utterances

    # Nothing to read from: the wrapped transcript has not been populated.
    raise ValueError("The internal Transcript object is None.")

@property
def confidence(self) -> Optional[float]:
"The confidence our model has in the transcribed text, between 0 and 1"
Expand Down
26 changes: 26 additions & 0 deletions assemblyai/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -876,6 +876,8 @@ class RawTranscriptionConfig(BaseModel):
"The list of PII Redaction policies to enable."
redact_pii_sub: Optional[PIISubstitutionPolicy] = None
"The replacement logic for detected PII."
redact_pii_return_unredacted: Optional[bool] = None
"If `redact_pii` is enabled, also return the unredacted text/words/utterances alongside the redacted fields."

speaker_labels: Optional[bool] = None
"Enable Speaker Diarization."
Expand Down Expand Up @@ -1008,6 +1010,7 @@ def __init__(
redact_pii_audio_options: Optional[RedactPiiAudioOptions] = None,
redact_pii_policies: Optional[List[PIIRedactionPolicy]] = None,
redact_pii_sub: Optional[PIISubstitutionPolicy] = None,
redact_pii_return_unredacted: Optional[bool] = None,
speaker_labels: Optional[bool] = None,
speakers_expected: Optional[int] = None,
speaker_options: Optional[SpeakerOptions] = None,
Expand Down Expand Up @@ -1060,6 +1063,7 @@ def __init__(
redact_pii_audio_options: Options for controlling PII audio redaction behavior (e.g., override the redaction method to silence).
redact_pii_policies: The list of PII Redaction policies to enable.
redact_pii_sub: The replacement logic for detected PII.
redact_pii_return_unredacted: If `redact_pii` is enabled, also return the unredacted text/words/utterances on the transcript response. Requires `redact_pii=True`.
speaker_labels: Enable Speaker Diarization.
speakers_expected: The number of speakers you expect to hear in your audio file. Up to 10 speakers are supported.
speaker_options: Advanced options for controlling speaker diarization parameters, including min and max speakers expected.
Expand Down Expand Up @@ -1117,6 +1121,7 @@ def __init__(
redact_pii_audio_options,
redact_pii_policies,
redact_pii_sub,
redact_pii_return_unredacted,
)
self.set_speaker_diarization(speaker_labels, speakers_expected, speaker_options)
self.set_content_safety(content_safety, content_safety_confidence)
Expand Down Expand Up @@ -1397,6 +1402,12 @@ def redact_pii_sub(self) -> Optional[PIISubstitutionPolicy]:

return self._raw_transcription_config.redact_pii_sub

@property
def redact_pii_return_unredacted(self) -> Optional[bool]:
    """
    Returns whether the unredacted text/words/utterances should also be
    returned alongside redacted fields.

    The value is read straight from the underlying raw transcription config.
    """
    raw_config = self._raw_transcription_config
    return raw_config.redact_pii_return_unredacted

@property
def speaker_labels(self) -> Optional[bool]:
"Returns the status of the Speaker Diarization feature."
Expand Down Expand Up @@ -1797,6 +1808,7 @@ def set_redact_pii(
redact_audio_options: Optional[RedactPiiAudioOptions] = None,
policies: Optional[List[PIIRedactionPolicy]] = None,
substitution: Optional[PIISubstitutionPolicy] = None,
return_unredacted: Optional[bool] = None,
) -> Self:
"""
Enables Personal Identifiable Information (PII) Redaction feature.
Expand All @@ -1808,6 +1820,7 @@ def set_redact_pii(
redact_audio_options: Options for controlling PII audio redaction behavior (e.g., override the redaction method to silence).
policies: A list of PII redaction policies to enable.
substitution: The replacement logic for detected PII (`PIISubstitutionPolicy.hash` by default).
return_unredacted: Also return the unredacted text/words/utterances on the transcript response. Only valid when redaction is enabled.
"""

if not enable:
Expand All @@ -1817,6 +1830,7 @@ def set_redact_pii(
self._raw_transcription_config.redact_pii_audio_options = None
self._raw_transcription_config.redact_pii_policies = None
self._raw_transcription_config.redact_pii_sub = None
self._raw_transcription_config.redact_pii_return_unredacted = None

return self

Expand All @@ -1829,6 +1843,7 @@ def set_redact_pii(
self._raw_transcription_config.redact_pii_audio_options = redact_audio_options
self._raw_transcription_config.redact_pii_policies = policies
self._raw_transcription_config.redact_pii_sub = substitution
self._raw_transcription_config.redact_pii_return_unredacted = return_unredacted

return self

Expand Down Expand Up @@ -2280,6 +2295,8 @@ class BaseTranscript(BaseModel):
"The list of PII Redaction policies to enable."
redact_pii_sub: Optional[PIISubstitutionPolicy] = None
"The replacement logic for detected PII."
redact_pii_return_unredacted: Optional[bool] = None
"If `redact_pii` is enabled, also return the unredacted text/words/utterances alongside the redacted fields."

speaker_labels: Optional[bool] = None
"Enable Speaker Diarization."
Expand Down Expand Up @@ -2424,6 +2441,15 @@ class TranscriptResponse(BaseTranscript):
utterances: Optional[List[Utterance]] = None
"When `dual_channel`, `multichannel`, or `speaker_labels` is enabled, a list of turn-by-turn utterances"

unredacted_text: Optional[str] = None
"The unredacted transcript text. Returned only when `redact_pii_return_unredacted` was set with `redact_pii`."

unredacted_words: Optional[List[Word]] = None
"The unredacted list of individual words. Returned only when `redact_pii_return_unredacted` was set with `redact_pii`."

unredacted_utterances: Optional[List[Utterance]] = None
"The unredacted list of utterances. Returned only when `redact_pii_return_unredacted` was set with `redact_pii` and channel/speaker modes are enabled."

confidence: Optional[float] = None
"The confidence our model has in the transcribed text, between 0.0 and 1.0"

Expand Down
104 changes: 104 additions & 0 deletions tests/unit/test_redact_pii.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,110 @@ def test_redact_pii_params_excluded_when_disabled(httpx_mock: HTTPXMock):
assert request_body.get("redact_pii_sub") is None


def test_redact_pii_return_unredacted_default_absent(httpx_mock: HTTPXMock):
    """
    Tests that `redact_pii_return_unredacted` is not sent (absent or null) in
    the request body when it was never set, even with `redact_pii` enabled.
    """
    # Build the mocked API response first, then the config under test.
    build_response = factories.generate_dict_factory(
        TranscriptWithPIIRedactionResponseFactory
    )
    mocked_response = build_response()
    pii_only_config = aai.TranscriptionConfig(
        redact_pii=True,
        redact_pii_policies=[aai.types.PIIRedactionPolicy.date],
    )

    sent_body, _ = unit_test_utils.submit_mock_transcription_request(
        httpx_mock,
        mock_response=mocked_response,
        config=pii_only_config,
    )

    assert sent_body.get("redact_pii_return_unredacted") is None


def test_redact_pii_return_unredacted_in_request(httpx_mock: HTTPXMock):
    """
    Tests that `set_redact_pii(..., return_unredacted=True)` results in both
    `redact_pii=True` and `redact_pii_return_unredacted=True` being present
    in the submission request body.
    """
    date_policy = aai.types.PIIRedactionPolicy.date
    config = aai.TranscriptionConfig().set_redact_pii(
        policies=[date_policy],
        return_unredacted=True,
    )
    build_response = factories.generate_dict_factory(
        TranscriptWithPIIRedactionResponseFactory
    )

    sent_body, _ = unit_test_utils.submit_mock_transcription_request(
        httpx_mock,
        mock_response=build_response(),
        config=config,
    )

    assert sent_body.get("redact_pii") is True
    assert sent_body.get("redact_pii_return_unredacted") is True


def test_redact_pii_return_unredacted_cleared_on_disable():
    """
    Tests that disabling PII redaction via `set_redact_pii(enable=False)`
    resets a previously enabled `return_unredacted` flag back to `None`.
    """
    fresh_config = aai.TranscriptionConfig()
    config = fresh_config.set_redact_pii(
        policies=[aai.types.PIIRedactionPolicy.date],
        return_unredacted=True,
    )
    # Precondition: the flag is visible through the public property.
    assert config.redact_pii_return_unredacted is True

    config.set_redact_pii(enable=False)

    assert config.redact_pii_return_unredacted is None


def test_unredacted_response_fields_surface_on_transcript(httpx_mock: HTTPXMock):
    """
    Tests that `unredacted_text`, `unredacted_words`, and
    `unredacted_utterances` from the API response can be read through the
    `Transcript` wrapper returned by the mocked submission.
    """

    def make_word(text, start, end):
        # A fresh dict per call so no payload fragment is shared by reference.
        return {"text": text, "start": start, "end": end, "confidence": 0.99}

    base_response = factories.generate_dict_factory(
        TranscriptWithPIIRedactionResponseFactory
    )()
    word_payload = [make_word("hello", 0, 500)]
    utterance_payload = [
        {
            "text": "hello world",
            "start": 0,
            "end": 1000,
            "confidence": 0.99,
            "speaker": "A",
            "words": [
                make_word("hello", 0, 500),
                make_word("world", 500, 1000),
            ],
        }
    ]
    # Overlay the unredacted fields on top of the factory-generated response.
    mock_response = dict(
        base_response,
        redact_pii_return_unredacted=True,
        unredacted_text="hello world",
        unredacted_words=word_payload,
        unredacted_utterances=utterance_payload,
    )

    config = aai.TranscriptionConfig().set_redact_pii(
        policies=[aai.types.PIIRedactionPolicy.date],
        return_unredacted=True,
    )
    _, transcript = unit_test_utils.submit_mock_transcription_request(
        httpx_mock,
        mock_response=mock_response,
        config=config,
    )

    assert transcript.unredacted_text == "hello world"

    words = transcript.unredacted_words
    assert words is not None
    assert len(words) == 1
    assert words[0].text == "hello"

    utterances = transcript.unredacted_utterances
    assert utterances is not None
    assert len(utterances) == 1
    assert utterances[0].text == "hello world"


def __get_redacted_audio_api_url(transcript: aai.Transcript) -> str:
return (
f"{aai.settings.base_url}{ENDPOINT_TRANSCRIPT}/{transcript.id}/redacted-audio"
Expand Down
Loading