Skip to content

Commit 9ba9009

Browse files
committed
load a url
1 parent 8d483c4 commit 9ba9009

File tree

4 files changed

+29
-10
lines changed

4 files changed

+29
-10
lines changed

batchalign/cli/cli.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,9 @@ def writer(doc, output):
154154
default=False, help="Perform speaker diarization (this flag is ignored with Rev.AI)")
155155
@click.option("--wor/--nowor",
156156
default=False, help="Should we write word level alignment line? Default to no.")
157+
@click.option("--data",
158+
help="the URL of the data",
159+
type=str)
157160
@click.option("--lang",
158161
help="sample language in three-letter ISO 3166-1 alpha-3 code",
159162
show_default=True,

batchalign/cli/dispatch.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
"""
66

77
from rich.progress import Progress, SpinnerColumn, TextColumn, TimeElapsedColumn, BarColumn
8+
from urllib.parse import urlparse
89

910
import warnings
1011

@@ -63,6 +64,15 @@ def _dispatch(command, lang, num_speakers,
6364
files = []
6465
outputs = []
6566

67+
if kwargs.get("data"):
68+
url = kwargs.get("data")
69+
url = urlparse(url)
70+
if url.scheme == "":
71+
url = url._replace(scheme="http")
72+
base = os.path.basename(url.path)
73+
files.append(url)
74+
outputs.append(os.path.join(out_dir, base))
75+
6676
for basedir, _, fs in os.walk(in_dir):
6777
for f in fs:
6878
path = Path(os.path.join(basedir, f))
@@ -128,7 +138,8 @@ def _dispatch(command, lang, num_speakers,
128138
errors = []
129139
# create the spinner bars
130140
for f in files:
131-
tasks[f] = prog.add_task(Path(f).name, start=False, processor="")
141+
tasks[f] = prog.add_task(Path(f).name if isinstance(f, str) else Path(f.geturl()).name,
142+
start=False, processor="")
132143

133144
# create pipeline and read files
134145
baL.debug("Attempting to create BatchalignPipeline for CLI...")
@@ -152,7 +163,7 @@ def progress_callback(file, step, total, tools):
152163
prog.start_task(tasks[file])
153164
with warnings.catch_warnings(record=True) as w:
154165
# parse the input format, as needed
155-
doc = loader(os.path.abspath(file))
166+
doc = loader(os.path.abspath(file) if isinstance(file, str) else file.geturl())
156167
# if we ended up with a tuple of length two,
157168
# that means that the loader requested kwargs
158169
kw = {}
@@ -179,7 +190,7 @@ def progress_callback(file, step, total, tools):
179190
if len(errors) > 0:
180191
C.print()
181192
for file, trcbk, e in errors:
182-
C.print(f"[bold red]ERROR[/bold red] on file [italic]{os.path.relpath(str(Path(file).absolute()), in_dir)}[/italic]: {escape(str(e))}\n")
193+
C.print(f"[bold red]ERROR[/bold red] on file [italic]{os.path.relpath(str(Path(file).absolute()), in_dir) if isinstance(file, str) else file.geturl()}[/italic]: {escape(str(e))}\n")
183194
if ctx.obj["verbose"] == 1:
184195
C.print(escape(str(trcbk)))
185196
elif ctx.obj["verbose"] > 1:

batchalign/pipelines/asr/tencent.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -74,16 +74,21 @@ def generate(self, f, **kwargs):
7474

7575
L.info(f"Uploading '{pathlib.Path(f).stem}'...")
7676
# we will send the file for processing
77-
with open(f, "rb") as image_file:
78-
encoded_string = base64.b64encode(image_file.read())
77+
if not str(f).startswith("http"):
78+
with open(f, "rb") as image_file:
79+
encoded_string = base64.b64encode(image_file.read())
7980

8081
req = models.CreateRecTaskRequest()
8182
req.EngineModelType = f"16k_{lang}"
8283
req.ResTextFormat = 1
83-
req.SourceType = 1
8484
req.SpeakerDiarization = 1
8585
req.ChannelNum = 1
86-
req.Data = encoded_string.decode('ascii')
86+
if not str(f).startswith("http"):
87+
req.Data = encoded_string.decode('ascii')
88+
req.SourceType = 1
89+
else:
90+
req.Url = f
91+
req.SourceType = 0
8792
resp = client.CreateRecTask(req)
8893

8994
L.info(f"Tencent is transcribing '{pathlib.Path(f).stem}'...")
@@ -96,7 +101,7 @@ def generate(self, f, **kwargs):
96101
res = client.DescribeTaskStatus(req)
97102

98103
# if failed, raise
99-
if res.Data.Status == "3":
104+
if res.Data.Status == "3" or res.Data.Status == 3:
100105
raise RuntimeError(f"Tencent reports job failed! error='{res.Data.ErrorMsg}'")
101106

102107
turns = []

batchalign/version

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
1-
0.7.17-post.15
1+
0.7.17-post.16
22
March 26th, 2025
3-
better coref model
3+
better tencent ASR

0 commit comments

Comments
 (0)