diff --git a/encexp/__init__.py b/encexp/__init__.py
index 7bfdcf6..542790e 100644
--- a/encexp/__init__.py
+++ b/encexp/__init__.py
@@ -17,4 +17,4 @@ if not '-m' in sys.argv:
     from encexp.text_repr import EncExpT, SeqTM, TextModel
 
 
-__version__ = "0.1.5"
+__version__ = "0.1.6"
diff --git a/encexp/download.py b/encexp/download.py
index b2cec62..893ed95 100644
--- a/encexp/download.py
+++ b/encexp/download.py
@@ -11,20 +11,19 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import argparse
-from encexp.utils import Download, MODELS, EncExp_URL
-from microtc.utils import tweet_iterator
-from os.path import isdir, isfile, join
-import numpy as np
 import os
-import encexp
+from os.path import isdir, isfile, join
+from microtc.utils import tweet_iterator
+from encexp.utils import Download, MODELS, EncExp_URL
 
 
-def download(identifier: str, first: bool=True, base_url: str=EncExp_URL):
+def download(identifier: str, first: bool=True,
+             base_url: str=EncExp_URL,
+             outputdir: str=MODELS):
     """download"""
-    if not isdir(MODELS):
-        os.mkdir(MODELS)
-    output = join(MODELS, f'{identifier}.json.gz')
+    if not isdir(outputdir):
+        os.mkdir(outputdir)
+    output = join(outputdir, f'{identifier}.json.gz')
     if isfile(output):
         try:
             if first:
diff --git a/encexp/utils.py b/encexp/utils.py
index 77a621e..fa75bf1 100644
--- a/encexp/utils.py
+++ b/encexp/utils.py
@@ -31,8 +31,7 @@
 DialectID_URL = 'https://github.com/INGEOTEC/dialectid/releases/download/data'
 EncExp_URL = 'https://github.com/INGEOTEC/EncExp/releases/download/data'
-MODELS = join(dirname(__file__),
-              'models')
+MODELS = join(dirname(__file__), 'models')
 
 
 class Download(object):
     """Download
@@ -70,7 +69,7 @@ def tqdm(self):
         self.tqdm = tqdm(total=self._nblocks, leave=False,
                          desc=self._output)
         return self._tqdm
-    
+
     @tqdm.setter
     def tqdm(self, value):
         self._tqdm = value
diff --git a/quarto/EncExp.qmd b/quarto/EncExp.qmd
index 75ce251..b46788a 100644
--- a/quarto/EncExp.qmd
+++ b/quarto/EncExp.qmd
@@ -104,7 +104,7 @@ score.plot()
 ```
 :::
 
-# Corpus
+# Corpora
 
 ## Column {.tabset}
 
@@ -280,14 +280,12 @@ Markdown(dataset_info('zh').to_markdown(index=False))
 ## Column
 
 ::: {.card title="Description"}
-The dataset used to create the self-supervised problems is a collection of Tweets collected from the open stream for several years, i.e., the Spanish collection started on December 11, 2015; English on July 1, 2016; Arabic on January 25, 2017; Russian on October 16, 2018; and the rest of the languages on June 1, 2021. In all the cases, the last day collected was June 9, 2023. The collected Tweets were filtered with the following restrictions: the retweets were removed; URL and users were replaced by the tokens _url and _usr, respectively; and only tweets with at least 50 characters were accepted in the final collection.
+The dataset used to create the self-supervised problems is a collection of tweets gathered from the open stream over several years: the Spanish collection started on December 11, 2015; English on July 1, 2016; Arabic on January 25, 2017; Russian on October 16, 2018; and the rest of the languages on June 1, 2021. In all cases, the last day collected was June 9, 2023. The tweets were filtered with the following restrictions: retweets were removed; URLs and usernames were replaced by the tokens _url and _usr, respectively; and only tweets with at least 50 characters were included in the final collection.
 
-The Corpus is divided into two sets: the first set is used as a training set, i.e., to estimate the parameters, while the second set corresponds to the test set, which could be used to measure the model performance. The basis for this division is a specific date, with tweets published prior to October 1, 2022, forming the first set and those published on October 3, 2022, or later, being used to create the test set.
+The corpora are divided into two sets: the first set is used as a training set, i.e., to estimate the parameters, while the second set corresponds to the test set, which can be used to measure the model's performance. The basis for this division is a specific date: tweets published before October 1, 2022, form the training set, and those published on or after October 3, 2022, form the test set.
 
-The training set and test set were created with an equivalent procedure; the only difference is that the maximum size of the training set is $2^{23}$ (8M) tweets and $2^{12}$ (4096) tweets for the test set.
+The training set and test set were created using an equivalent procedure; the only difference is the maximum size: $2^{23}$ (8,388,608) tweets for the training set and $2^{12}$ (4,096) tweets for the test set.
 
-The training and test set was meticulously crafted by uniformly selecting the maximum number of tweets (i.e., $2^{23}$ and $2^{12}$, respectively) from each day collected. These selected tweets were then organized by day, and within each day, the tweets were randomly chosen, with near duplicates being removed. The subsequent step involved the elimination of tweets that were near duplicates of the previous three days.
-
-It is worth mentioning that the last step is to shuffle the training and test set to eliminate the ordering by date.
+There are pairs of training and test sets for each country, built from tweets with geographic information, and a pair that groups all tweets without geographic information, labeled ALL. Each set was constructed to have, as closely as possible, a uniform distribution of tweets across the collected days. Within each day, near duplicates were removed; then a three-day sliding window was used to remove near duplicates across consecutive days. The final step was to shuffle each set to remove the ordering by date.
 :::
 
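Usage note (illustrative, not part of the patch): the sketch below exercises the new `outputdir` parameter added to `encexp.download.download` above. The identifier is a hypothetical placeholder (substitute a model actually published under `EncExp_URL`), and the temporary directory exists only for the example.

```python
import tempfile
from os.path import isfile, join

from encexp.download import download
from encexp.utils import MODELS

identifier = 'some-model'  # hypothetical placeholder, not a published model

# Default behaviour (unchanged): the file is stored in the package's
# bundled `models` directory.
download(identifier)
assert isfile(join(MODELS, f'{identifier}.json.gz'))

# New in 0.1.6: redirect the download to a caller-chosen directory.
# The patch keeps `os.mkdir`, which creates only the leaf directory,
# so the parent of `outputdir` must already exist.
with tempfile.TemporaryDirectory() as tmp:
    target = join(tmp, 'models')  # `tmp` exists; `models` is created
    download(identifier, outputdir=target)
    assert isfile(join(target, f'{identifier}.json.gz'))
```

Keeping `MODELS` as the default preserves the previous behaviour, while the new parameter lets callers avoid writing into the installed package tree, e.g., on read-only installs.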
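The corpora description in `quarto/EncExp.qmd` above outlines the near-duplicate filtering: duplicates are removed within each day and then over a three-day sliding window. That pipeline is not part of this patch; the following is a minimal sketch of one reading of the windowing logic, where `normalise` is an assumed stand-in for the unspecified near-duplicate test and the window is taken as the current day plus the two preceding days.

```python
from collections import deque


def normalise(tweet: str) -> str:
    """Assumed near-duplicate key; the actual similarity test is unspecified."""
    return ' '.join(tweet.lower().split())


def dedup_sliding_window(days, window_size=3):
    """`days`: iterable of (date, tweets) pairs ordered by date.
    Removes near duplicates within each day and against the previous
    `window_size - 1` days, so every comparison spans `window_size` days."""
    window = deque(maxlen=window_size - 1)  # keys seen on the previous days
    for date, tweets in days:
        previous = set().union(*window)  # keys from the days in the window
        today, kept = set(), []
        for tweet in tweets:
            key = normalise(tweet)
            if key not in previous and key not in today:
                today.add(key)
                kept.append(tweet)
        window.append(today)
        yield date, kept


days = [('2022-10-03', ['Good morning world', 'good  morning world', 'rain']),
        ('2022-10-04', ['good morning world', 'something new'])]
for date, kept in dedup_sliding_window(days):
    print(date, kept)
# 2022-10-03 ['Good morning world', 'rain']
# 2022-10-04 ['something new']
```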