{"id":551,"date":"2024-04-06T21:56:38","date_gmt":"2024-04-06T13:56:38","guid":{"rendered":"http:\/\/tobykskgd.life\/?p=551"},"modified":"2024-11-14T22:21:01","modified_gmt":"2024-11-14T14:21:01","slug":"23","status":"publish","type":"post","link":"https:\/\/tobykskgd.life\/index.php\/23\/","title":{"rendered":"\u674e\u5b8f\u6bc5\u673a\u5668\u5b66\u4e60\u8bfe\u7a0b\u7b14\u8bb0EP15"},"content":{"rendered":"\n<p class=\"wp-block-paragraph\">\u3010HW5\u3011Transformer0.0\u674e\u5b8f\u6bc52021\/2022\u6625\u673a\u5668\u5b66\u4e60\u8bfe\u7a0b\u7b14\u8bb0EP15(P54-P57)<\/p>\n\n\n\n<figure class=\"wp-block-image size-full\"><div class='fancybox-wrapper lazyload-container-unload' data-fancybox='post-images' href='https:\/\/tobykskgd.life\/wp-content\/uploads\/2024\/02\/\u5c4f\u5e55\u622a\u56fe-2024-02-05-213355.png'><img class=\"lazyload lazyload-style-1\" src=\"data:image\/svg+xml;base64,PCEtLUFyZ29uTG9hZGluZy0tPgo8c3ZnIHdpZHRoPSIxIiBoZWlnaHQ9IjEiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyIgc3Ryb2tlPSIjZmZmZmZmMDAiPjxnPjwvZz4KPC9zdmc+\"  loading=\"lazy\" decoding=\"async\" width=\"432\" height=\"218\" data-original=\"https:\/\/tobykskgd.life\/wp-content\/uploads\/2024\/02\/\u5c4f\u5e55\u622a\u56fe-2024-02-05-213355.png\" src=\"data:image\/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAADsQAAA7EAZUrDhsAAAANSURBVBhXYzh8+PB\/AAffA0nNPuCLAAAAAElFTkSuQmCC\" alt=\"\" class=\"wp-image-37\"  sizes=\"auto, (max-width: 432px) 100vw, 432px\" \/><\/div><\/figure>\n\n\n\n<p class=\"wp-block-paragraph\">\u4ece\u4eca\u5929\u5f00\u59cb\u6211\u5c06\u5b66\u4e60\u674e\u5b8f\u6bc5\u6559\u6388\u7684\u673a\u5668\u5b66\u4e60\u89c6\u9891\uff0c\u4e0b\u9762\u662f\u8bfe\u7a0b\u7684\u8fde\u63a5<a 
href=\"https:\/\/www.bilibili.com\/video\/BV1Wv411h7kN\/?spm_id_from=333.337.search-card.all.click&amp;vd_source=fa9de75b9e5251495ee15fc767cb5892\">(\u5f3a\u63a8)\u674e\u5b8f\u6bc52021\/2022\u6625\u673a\u5668\u5b66\u4e60\u8bfe\u7a0b_\u54d4\u54e9\u54d4\u54e9_bilibili<\/a>\u3002\u4e00\u5171\u6709155\u4e2a\u89c6\u9891\uff0c\u4e89\u53d6\u90fd\u5b66\u4e60\u5b8c\u6210\u5427\u3002<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">\u90a3\u4e48\u9996\u5148\u8fd9\u95e8\u8bfe\u7a0b\u9700\u8981\u6709\u4e00\u5b9a\u7684\u4ee3\u7801\u57fa\u7840\uff0c\u7b80\u5355\u5b66\u4e60\u4e00\u4e0bPython\u7684\u57fa\u672c\u7528\u6cd5\uff0c\u8fd8\u6709\u91cc\u9762\u7684NumPy\u5e93\u7b49\u7b49\u7684\u57fa\u672c\u77e5\u8bc6\u3002\u518d\u5c31\u662f\u6570\u5b66\u65b9\u9762\u7684\u57fa\u7840\u5566\uff0c\u5fae\u79ef\u5206\u3001\u7ebf\u6027\u4ee3\u6570\u548c\u6982\u7387\u8bba\u7684\u57fa\u7840\u90fd\u662f\u542c\u61c2\u8fd9\u95e8\u8bfe\u5fc5\u987b\u7684\u3002<\/p>\n\n\n\n<hr class=\"wp-block-separator has-alpha-channel-opacity\"\/>\n\n\n\n<p class=\"wp-block-paragraph\">u1s1\uff0c\u4f5c\u4e1a\u4e94\u7684\u4ee3\u7801\u4e00\u76f4\u6ca1\u6709\u641e\u61c2\uff0c\u8fd9\u91cc\u5148\u76f4\u63a5\u653e\u4e00\u4e2a\u52a9\u6559\u7684.ipynb\u3002<\/p>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Download and import required packages<\/strong><\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>!pip install 'torch&gt;=1.6.0' editdistance matplotlib sacrebleu sacremoses sentencepiece tqdm wandb\n!pip install --upgrade jupyter ipywidgets<\/code><\/pre>\n\n\n\n<pre class=\"wp-block-code\"><code>!git clone https:\/\/github.com\/pytorch\/fairseq.git\n!cd fairseq &amp;&amp; git checkout 9a1c497\n!pip install --upgrade .\/fairseq\/<\/code><\/pre>\n\n\n\n<pre class=\"wp-block-code\"><code>import sys\nimport pdb\nimport pprint\nimport logging\nimport os\nimport random\n\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nfrom torch.utils import data\nimport numpy as np\nimport tqdm.auto as tqdm\nfrom pathlib 
import Path\nfrom argparse import Namespace\nfrom fairseq import utils\n\nimport matplotlib.pyplot as plt<\/code><\/pre>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Fix random seed<\/strong><\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>seed = 33\nrandom.seed(seed)\ntorch.manual_seed(seed)\nif torch.cuda.is_available():\n    torch.cuda.manual_seed(seed)\n    torch.cuda.manual_seed_all(seed)  \nnp.random.seed(seed)  \ntorch.backends.cudnn.benchmark = False\ntorch.backends.cudnn.deterministic = True<\/code><\/pre>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Dataset<\/strong><\/p>\n\n\n\n<p class=\"wp-block-paragraph\"><strong> En-Zh Bilingual Parallel Corpus<\/strong><\/p>\n\n\n\n<p class=\"wp-block-paragraph\"> TED2020<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&nbsp; &nbsp; &#8211; Raw: 400,726 (sentences) &nbsp;<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&nbsp; &nbsp; &#8211; Processed: 394,052 (sentences)<\/p>\n\n\n\n<p class=\"wp-block-paragraph\"><strong> Testdata<\/strong><\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&#8211; Size: 4,000 (sentences)<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&#8211; <strong>**Chinese translation is undisclosed. 
The provided (.zh) file is a pseudo translation, each line is a &#8216;\u3002&#8217;**<\/strong><\/p>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Dataset Download<\/strong><\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>data_dir = '.\/DATA\/rawdata'\ndataset_name = 'ted2020'\nurls = (\n    \"https:\/\/github.com\/figisiwirf\/ml2023-hw5-dataset\/releases\/download\/v1.0.1\/ml2023.hw5.data.tgz\",\n    \"https:\/\/github.com\/figisiwirf\/ml2023-hw5-dataset\/releases\/download\/v1.0.1\/ml2023.hw5.test.tgz\"\n)\nfile_names = (\n    'ted2020.tgz', # train &amp; dev\n    'test.tgz', # test\n)\nprefix = Path(data_dir).absolute() \/ dataset_name\n\nprefix.mkdir(parents=True, exist_ok=True)\nfor u, f in zip(urls, file_names):\n    path = prefix\/f\n    if not path.exists():\n        !wget {u} -O {path}\n    if path.suffix == \".tgz\":\n        !tar -xvf {path} -C {prefix}\n    elif path.suffix == \".zip\":\n        !unzip -o {path} -d {prefix}\n!mv {prefix\/'raw.en'} {prefix\/'train_dev.raw.en'}\n!mv {prefix\/'raw.zh'} {prefix\/'train_dev.raw.zh'}\n!mv {prefix\/'test.en'} {prefix\/'test.raw.en'}\n!mv {prefix\/'test.zh'} {prefix\/'test.raw.zh'}<\/code><\/pre>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Language<\/strong><\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>src_lang = 'en'\ntgt_lang = 'zh'\n\ndata_prefix = f'{prefix}\/train_dev.raw'\ntest_prefix = f'{prefix}\/test.raw'<\/code><\/pre>\n\n\n\n<pre class=\"wp-block-code\"><code>!head {data_prefix+'.'+src_lang} -n 5\n!head {data_prefix+'.'+tgt_lang} -n 5<\/code><\/pre>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Preprocess files<\/strong><\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>import re\n\ndef strQ2B(ustring):\n    \"\"\"Full width -&gt; half width\"\"\"\n    # reference:https:\/\/ithelp.ithome.com.tw\/articles\/10233122\n    ss = &#91;]\n    for s in ustring:\n        rstring = \"\"\n        for uchar in s:\n            inside_code = ord(uchar)\n            if inside_code == 12288:  # Full width space: 
direct conversion\n                inside_code = 32\n            elif (inside_code &gt;= 65281 and inside_code &lt;= 65374):  # Full width chars (except space) conversion\n                inside_code -= 65248\n            rstring += chr(inside_code)\n        ss.append(rstring)\n    return ''.join(ss)\n                \ndef clean_s(s, lang):\n    if lang == 'en':\n        s = re.sub(r\"\\(&#91;^()]*\\)\", \"\", s) # remove (&#91;text])\n        s = s.replace('-', '') # remove '-'\n        s = re.sub('(&#91;.,;!?()\\\"])', r' \\1 ', s) # keep punctuation\n    elif lang == 'zh':\n        s = strQ2B(s) # Q2B\n        s = re.sub(r\"\\(&#91;^()]*\\)\", \"\", s) # remove (&#91;text])\n        s = s.replace(' ', '')\n        s = s.replace('\u2014', '')\n        s = s.replace('\u201c', '\"')\n        s = s.replace('\u201d', '\"')\n        s = s.replace('_', '')\n        s = re.sub('(&#91;\u3002,;!?()\\\"~\u300c\u300d])', r' \\1 ', s) # keep punctuation\n    s = ' '.join(s.strip().split())\n    return s\n\ndef len_s(s, lang):\n    if lang == 'zh':\n        return len(s)\n    return len(s.split())\n\ndef clean_corpus(prefix, l1, l2, ratio=9, max_len=1000, min_len=1):\n    if Path(f'{prefix}.clean.{l1}').exists() and Path(f'{prefix}.clean.{l2}').exists():\n        print(f'{prefix}.clean.{l1} &amp; {l2} exists. 
skipping clean.')\n        return\n    with open(f'{prefix}.{l1}', 'r') as l1_in_f:\n        with open(f'{prefix}.{l2}', 'r') as l2_in_f:\n            with open(f'{prefix}.clean.{l1}', 'w') as l1_out_f:\n                with open(f'{prefix}.clean.{l2}', 'w') as l2_out_f:\n                    for s1 in l1_in_f:\n                        s1 = s1.strip()\n                        s2 = l2_in_f.readline().strip()\n                        s1 = clean_s(s1, l1)\n                        s2 = clean_s(s2, l2)\n                        s1_len = len_s(s1, l1)\n                        s2_len = len_s(s2, l2)\n                        if min_len &gt; 0: # remove short sentence\n                            if s1_len &lt; min_len or s2_len &lt; min_len:\n                                continue\n                        if max_len &gt; 0: # remove long sentence\n                            if s1_len &gt; max_len or s2_len &gt; max_len:\n                                continue\n                        if ratio &gt; 0: # remove by ratio of length\n                            if s1_len\/s2_len &gt; ratio or s2_len\/s1_len &gt; ratio:\n                                continue\n                        print(s1, file=l1_out_f)\n                        print(s2, file=l2_out_f)<\/code><\/pre>\n\n\n\n<pre class=\"wp-block-code\"><code>clean_corpus(data_prefix, src_lang, tgt_lang)\nclean_corpus(test_prefix, src_lang, tgt_lang, ratio=-1, min_len=-1, max_len=-1)<\/code><\/pre>\n\n\n\n<pre class=\"wp-block-code\"><code>!head {data_prefix+'.clean.'+src_lang} -n 5\n!head {data_prefix+'.clean.'+tgt_lang} -n 5<\/code><\/pre>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Split into train\/valid<\/strong><\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>valid_ratio = 0.01 # 3000~4000 would suffice\ntrain_ratio = 1 - valid_ratio<\/code><\/pre>\n\n\n\n<pre class=\"wp-block-code\"><code>if (prefix\/f'train.clean.{src_lang}').exists() \\\nand (prefix\/f'train.clean.{tgt_lang}').exists() \\\nand 
(prefix\/f'valid.clean.{src_lang}').exists() \\\nand (prefix\/f'valid.clean.{tgt_lang}').exists():\n    print(f'train\/valid splits exists. skipping split.')\nelse:\n    line_num = sum(1 for line in open(f'{data_prefix}.clean.{src_lang}'))\n    labels = list(range(line_num))\n    random.shuffle(labels)\n    for lang in &#91;src_lang, tgt_lang]:\n        train_f = open(os.path.join(data_dir, dataset_name, f'train.clean.{lang}'), 'w')\n        valid_f = open(os.path.join(data_dir, dataset_name, f'valid.clean.{lang}'), 'w')\n        count = 0\n        for line in open(f'{data_prefix}.clean.{lang}', 'r'):\n            if labels&#91;count]\/line_num &lt; train_ratio:\n                train_f.write(line)\n            else:\n                valid_f.write(line)\n            count += 1\n        train_f.close()\n        valid_f.close()<\/code><\/pre>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Subword Units<\/strong><\/p>\n\n\n\n<p class=\"wp-block-paragraph\">Out of vocabulary (OOV) has been a major problem in machine translation. This can be alleviated by using subword units.<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&#8211; We will use the [sentencepiece](#kudo-richardson-2018-sentencepiece) package<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&#8211; select &#8216;unigram&#8217; or &#8216;byte-pair encoding (BPE)&#8217; algorithm<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>import sentencepiece as spm\nvocab_size = 8000\nif (prefix\/f'spm{vocab_size}.model').exists():\n    print(f'{prefix}\/spm{vocab_size}.model exists. 
skipping spm_train.')\nelse:\n    spm.SentencePieceTrainer.train(\n        input=','.join(&#91;f'{prefix}\/train.clean.{src_lang}',\n                        f'{prefix}\/valid.clean.{src_lang}',\n                        f'{prefix}\/train.clean.{tgt_lang}',\n                        f'{prefix}\/valid.clean.{tgt_lang}']),\n        model_prefix=prefix\/f'spm{vocab_size}',\n        vocab_size=vocab_size,\n        character_coverage=1,\n        model_type='unigram', # 'bpe' works as well\n        input_sentence_size=1e6,\n        shuffle_input_sentence=True,\n        normalization_rule_name='nmt_nfkc_cf',\n    )<\/code><\/pre>\n\n\n\n<pre class=\"wp-block-code\"><code>spm_model = spm.SentencePieceProcessor(model_file=str(prefix\/f'spm{vocab_size}.model'))\nin_tag = {\n    'train': 'train.clean',\n    'valid': 'valid.clean',\n    'test': 'test.raw.clean',\n}\nfor split in &#91;'train', 'valid', 'test']:\n    for lang in &#91;src_lang, tgt_lang]:\n        out_path = prefix\/f'{split}.{lang}'\n        if out_path.exists():\n            print(f\"{out_path} exists. skipping spm_encode.\")\n        else:\n            with open(prefix\/f'{split}.{lang}', 'w') as out_f:\n                with open(prefix\/f'{in_tag&#91;split]}.{lang}', 'r') as in_f:\n                    for line in in_f:\n                        line = line.strip()\n                        tok = spm_model.encode(line, out_type=str)\n                        print(' '.join(tok), file=out_f)<\/code><\/pre>\n\n\n\n<pre class=\"wp-block-code\"><code>!head {data_dir+'\/'+dataset_name+'\/train.'+src_lang} -n 5\n!head {data_dir+'\/'+dataset_name+'\/train.'+tgt_lang} -n 5<\/code><\/pre>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Binarize the data with fairseq<\/strong><\/p>\n\n\n\n<p class=\"wp-block-paragraph\">Prepare the files in pairs for both the source and target languages. 
<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">In case a pair is unavailable, generate a pseudo pair to facilitate binarization.<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>binpath = Path('.\/DATA\/data-bin', dataset_name)\nif binpath.exists():\n    print(binpath, \"exists, will not overwrite!\")\nelse:\n    !python -m fairseq_cli.preprocess \\\n        --source-lang {src_lang}\\\n        --target-lang {tgt_lang}\\\n        --trainpref {prefix\/'train'}\\\n        --validpref {prefix\/'valid'}\\\n        --testpref {prefix\/'test'}\\\n        --destdir {binpath}\\\n        --joined-dictionary\\\n        --workers 2<\/code><\/pre>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Configuration for experiments<\/strong><\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>config = Namespace(\n    datadir = \".\/DATA\/data-bin\/ted2020\",\n    savedir = \".\/checkpoints\/rnn\",\n    source_lang = src_lang,\n    target_lang = tgt_lang,\n    \n    # cpu threads when fetching &amp; processing data.\n    num_workers=2,  \n    # batch size in terms of tokens. gradient accumulation increases the effective batchsize.\n    max_tokens=8192,\n    accum_steps=2,\n    \n    # the lr s calculated from Noam lr scheduler. 
you can tune the maximum lr by this factor.\n    lr_factor=2.,\n    lr_warmup=4000,\n    \n    # clipping gradient norm helps alleviate gradient exploding\n    clip_norm=1.0,\n    \n    # maximum epochs for training\n    max_epoch=15,\n    start_epoch=1,\n    \n    # beam size for beam search\n    beam=5, \n    # generate sequences of maximum length ax + b, where x is the source length\n    max_len_a=1.2, \n    max_len_b=10, \n    # when decoding, post process sentence by removing sentencepiece symbols and jieba tokenization.\n    post_process = \"sentencepiece\",\n    \n    # checkpoints\n    keep_last_epochs=5,\n    resume=None, # if resume from checkpoint name (under config.savedir)\n    \n    # logging\n    use_wandb=False,\n)<\/code><\/pre>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Logging<\/strong><\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&#8211; logging package logs ordinary messages<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&#8211; wandb logs the loss, bleu, etc. in the training process<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>logging.basicConfig(\n    format=\"%(asctime)s | %(levelname)s | %(name)s | %(message)s\",\n    datefmt=\"%Y-%m-%d %H:%M:%S\",\n    level=\"INFO\", # \"DEBUG\" \"WARNING\" \"ERROR\"\n    stream=sys.stdout,\n)\nproj = \"hw5.seq2seq\"\nlogger = logging.getLogger(proj)\nif config.use_wandb:\n    import wandb\n    wandb.init(project=proj, name=Path(config.savedir).stem, config=config)<\/code><\/pre>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>CUDA Environments<\/strong><\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>cuda_env = utils.CudaEnvironment()\nutils.CudaEnvironment.pretty_print_cuda_env_list(&#91;cuda_env])\ndevice = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')<\/code><\/pre>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Dataloading<\/strong><\/p>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>We borrow the TranslationTask from fairseq<\/strong><\/p>\n\n\n\n<p class=\"wp-block-paragraph\">used 
to load the binarized data created above<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">well-implemented data iterator (dataloader)<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">built-in task.source_dictionary and task.target_dictionary are also handy<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">well-implemented beam search decoder<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>from fairseq.tasks.translation import TranslationConfig, TranslationTask\n\n## setup task\ntask_cfg = TranslationConfig(\n    data=config.datadir,\n    source_lang=config.source_lang,\n    target_lang=config.target_lang,\n    train_subset=\"train\",\n    required_seq_len_multiple=8,\n    dataset_impl=\"mmap\",\n    upsample_primary=1,\n)\ntask = TranslationTask.setup_task(task_cfg)<\/code><\/pre>\n\n\n\n<pre class=\"wp-block-code\"><code>logger.info(\"loading data for epoch 1\")\ntask.load_dataset(split=\"train\", epoch=1, combine=True) # combine if you have back-translation data.\ntask.load_dataset(split=\"valid\", epoch=1)<\/code><\/pre>\n\n\n\n<pre class=\"wp-block-code\"><code>sample = task.dataset(\"valid\")&#91;1]\npprint.pprint(sample)\npprint.pprint(\n    \"Source: \" + \\\n    task.source_dictionary.string(\n        sample&#91;'source'],\n        config.post_process,\n    )\n)\npprint.pprint(\n    \"Target: \" + \\\n    task.target_dictionary.string(\n        sample&#91;'target'],\n        config.post_process,\n    )\n)<\/code><\/pre>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Dataset iterator<\/strong><\/p>\n\n\n\n<p class=\"wp-block-paragraph\">* Controls every batch to contain no more than N tokens, which optimizes GPU memory efficiency<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">* Shuffles the training set for every epoch<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">* Ignore sentences exceeding maximum length<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">* Pad all sentences in a batch to the same length, which enables parallel computing by GPU<\/p>\n\n\n\n<p 
class=\"wp-block-paragraph\">* Add eos and shift one token<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&nbsp; &nbsp; &#8211; teacher forcing: to train the model to predict the next token based on prefix, we feed the right shifted target sequence as the decoder input.<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&nbsp; &nbsp; &#8211; generally, prepending bos to the target would do the job (as shown below)<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">![seq2seq](https:\/\/i.imgur.com\/0zeDyuI.png)<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&nbsp; &nbsp; &#8211; in fairseq however, this is done by moving the eos token to the beginning. Empirically, this has the same effect. For instance:<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&nbsp; &nbsp; &#8220;`<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&nbsp; &nbsp; # output target (target) and Decoder input (prev_output_tokens):<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;eos = 2<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; target = 419, &nbsp;711, &nbsp;238, &nbsp;888, &nbsp;792, &nbsp; 60, &nbsp;968, &nbsp; &nbsp;8, &nbsp; &nbsp;2<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&nbsp; &nbsp; prev_output_tokens = 2, &nbsp;419, &nbsp;711, &nbsp;238, &nbsp;888, &nbsp;792, &nbsp; 60, &nbsp;968, &nbsp; &nbsp;8<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&nbsp; &nbsp; &#8220;`<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>def load_data_iterator(task, split, epoch=1, max_tokens=4000, num_workers=1, cached=True):\n    batch_iterator = task.get_batch_iterator(\n        dataset=task.dataset(split),\n        max_tokens=max_tokens,\n        max_sentences=None,\n        max_positions=utils.resolve_max_positions(\n            task.max_positions(),\n            max_tokens,\n        ),\n        ignore_invalid_inputs=True,\n        seed=seed,\n        num_workers=num_workers,\n        epoch=epoch,\n        
disable_iterator_cache=not cached,\n        # Set this to False to speed up. However, if set to False, changing max_tokens beyond \n        # first call of this method has no effect. \n    )\n    return batch_iterator\n\ndemo_epoch_obj = load_data_iterator(task, \"valid\", epoch=1, max_tokens=20, num_workers=1, cached=False)\ndemo_iter = demo_epoch_obj.next_epoch_itr(shuffle=True)\nsample = next(demo_iter)\nsample<\/code><\/pre>\n\n\n\n<p class=\"wp-block-paragraph\">* each batch is a python dict, with string key and Tensor value. Contents are described below:<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&#8220;`python<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">batch = {<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&nbsp; &nbsp; &#8220;id&#8221;: id, # id for each example<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&nbsp; &nbsp; &#8220;nsentences&#8221;: len(samples), # batch size (sentences)<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&nbsp; &nbsp; &#8220;ntokens&#8221;: ntokens, # batch size (tokens)<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&nbsp; &nbsp; &#8220;net_input&#8221;: {<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&nbsp; &nbsp; &nbsp; &nbsp; &#8220;src_tokens&#8221;: src_tokens, # sequence in source language<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&nbsp; &nbsp; &nbsp; &nbsp; &#8220;src_lengths&#8221;: src_lengths, # sequence length of each example before padding<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&nbsp; &nbsp; &nbsp; &nbsp; &#8220;prev_output_tokens&#8221;: prev_output_tokens, # right shifted target, as mentioned above.<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&nbsp; &nbsp; },<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&nbsp; &nbsp; &#8220;target&#8221;: target, # target sequence<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">}<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&#8220;`<\/p>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Model Architecture<\/strong><\/p>\n\n\n\n<p class=\"wp-block-paragraph\">* We again inherit fairseq&#8217;s 
encoder, decoder and model, so that in the testing phase we can directly leverage fairseq&#8217;s beam search decoder.<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>from fairseq.models import (\n    FairseqEncoder, \n    FairseqIncrementalDecoder,\n    FairseqEncoderDecoderModel\n)<\/code><\/pre>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Encoder<\/strong><\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&#8211; The Encoder is a RNN or Transformer Encoder. The following description is for RNN. For every input token, Encoder will generate a output vector and a hidden states vector, and the hidden states vector is passed on to the next step. In other words, the Encoder sequentially reads in the input sequence, and outputs a single vector at each timestep, then finally outputs the final hidden states, or content vector, at the last timestep.<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&#8211; Parameters:<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&nbsp; &#8211; <em>*args*<\/em><\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&nbsp; &nbsp; &nbsp; &#8211; encoder_embed_dim: the dimension of embeddings, this compresses the one-hot vector into fixed dimensions, which achieves dimension reduction<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&nbsp; &nbsp; &nbsp; &#8211; encoder_ffn_embed_dim is the dimension of hidden states and output vectors<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&nbsp; &nbsp; &nbsp; &#8211; encoder_layers is the number of layers for Encoder RNN<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&nbsp; &nbsp; &nbsp; &#8211; dropout determines the probability of a neuron&#8217;s activation being set to 0, in order to prevent overfitting. Generally this is applied in training, and removed in testing.<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&nbsp; &#8211; <em>*dictionary*<\/em>: the dictionary provided by fairseq. 
it&#8217;s used to obtain the padding index, and in turn the encoder padding mask.<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&nbsp; &#8211; <em>*embed_tokens*<\/em>: an instance of token embeddings (nn.Embedding)<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&#8211; Inputs:<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&nbsp; &nbsp; &#8211; <em>*src_tokens*<\/em>: integer sequence representing English e.g. 1, 28, 29, 205, 2<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&#8211; Outputs:<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&nbsp; &nbsp; &#8211; <em>*outputs*<\/em>: the output of RNN at each timestep, can be further processed by Attention<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&nbsp; &nbsp; &#8211; <em>*final_hiddens*<\/em>: the final hidden states of each layer (at the last timestep), will be passed to decoder for decoding<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&nbsp; &nbsp; &#8211; <em>*encoder_padding_mask*<\/em>: this tells the decoder which position to ignore<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>class RNNEncoder(FairseqEncoder):\n    def __init__(self, args, dictionary, embed_tokens):\n        super().__init__(dictionary)\n        self.embed_tokens = embed_tokens\n        \n        self.embed_dim = args.encoder_embed_dim\n        self.hidden_dim = args.encoder_ffn_embed_dim\n        self.num_layers = args.encoder_layers\n        \n        self.dropout_in_module = nn.Dropout(args.dropout)\n        self.rnn = nn.GRU(\n            self.embed_dim, \n            self.hidden_dim, \n            self.num_layers, \n            dropout=args.dropout, \n            batch_first=False, \n            bidirectional=True\n        )\n        self.dropout_out_module = nn.Dropout(args.dropout)\n        \n        self.padding_idx = dictionary.pad()\n        \n    def combine_bidir(self, outs, bsz: int):\n        out = outs.view(self.num_layers, 2, bsz, -1).transpose(1, 2).contiguous()\n        return out.view(self.num_layers, bsz, -1)\n\n    def forward(self, src_tokens, **unused):\n   
     bsz, seqlen = src_tokens.size()\n        \n        # get embeddings\n        x = self.embed_tokens(src_tokens)\n        x = self.dropout_in_module(x)\n\n        # B x T x C -&gt; T x B x C\n        x = x.transpose(0, 1)\n        \n        # pass thru bidirectional RNN\n        h0 = x.new_zeros(2 * self.num_layers, bsz, self.hidden_dim)\n        x, final_hiddens = self.rnn(x, h0)\n        outputs = self.dropout_out_module(x)\n        # outputs = &#91;sequence len, batch size, hid dim * directions]\n        # hidden =  &#91;num_layers * directions, batch size  , hid dim]\n        \n        # Since Encoder is bidirectional, we need to concatenate the hidden states of two directions\n        final_hiddens = self.combine_bidir(final_hiddens, bsz)\n        # hidden =  &#91;num_layers x batch x num_directions*hidden]\n        \n        encoder_padding_mask = src_tokens.eq(self.padding_idx).t()\n        return tuple(\n            (\n                outputs,  # seq_len x batch x hidden\n                final_hiddens,  # num_layers x batch x num_directions*hidden\n                encoder_padding_mask,  # seq_len x batch\n            )\n        )\n    \n    def reorder_encoder_out(self, encoder_out, new_order):\n        # This is used by fairseq's beam search. 
How and why is not particularly important here.\n        return tuple(\n            (\n                encoder_out&#91;0].index_select(1, new_order),\n                encoder_out&#91;1].index_select(1, new_order),\n                encoder_out&#91;2].index_select(1, new_order),\n            )\n        )<\/code><\/pre>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Attention<\/strong><\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&#8211; When the input sequence is long, &#8220;content vector&#8221; alone cannot accurately represent the whole sequence, attention mechanism can provide the Decoder more information.<br>&#8211; According to the **Decoder embeddings** of the current timestep, match the **Encoder outputs** with decoder embeddings to determine correlation, and then sum the Encoder outputs weighted by the correlation as the input to **Decoder** RNN.<br>&#8211; Common attention implementations use neural network \/ dot product as the correlation between **query** (decoder embeddings) and **key** (Encoder outputs), followed by **softmax**  to obtain a distribution, and finally **values** (Encoder outputs) is **weighted sum**-ed by said distribution.<br><br>&#8211; Parameters:<br>  &#8211; *input_embed_dim*: dimensionality of query, should be that of the vector in decoder that attends to others<br>  &#8211; *source_embed_dim*: dimensionality of key\/value, should be that of the vector to be attended to (encoder outputs)<br>  &#8211; *output_embed_dim*: dimensionality of the output, should be that of the vector after attention, expected by the next layer<br><br>&#8211; Inputs: <br>    &#8211; *inputs*: is the query, the vector that attends to others<br>    &#8211; *encoder_outputs*:  is the key\/value, the vector to be attended to<br>    &#8211; *encoder_padding_mask*: this tells the decoder which position to ignore<br>&#8211; Outputs: <br>    &#8211; *output*: the context vector after attention<br>    &#8211; *attention score*: the attention distribution<br><\/p>\n\n\n\n<pre 
class=\"wp-block-code\"><code>class AttentionLayer(nn.Module):\n    def __init__(self, input_embed_dim, source_embed_dim, output_embed_dim, bias=False):\n        super().__init__()\n\n        self.input_proj = nn.Linear(input_embed_dim, source_embed_dim, bias=bias)\n        self.output_proj = nn.Linear(\n            input_embed_dim + source_embed_dim, output_embed_dim, bias=bias\n        )\n\n    def forward(self, inputs, encoder_outputs, encoder_padding_mask):\n        # inputs: T, B, dim\n        # encoder_outputs: S x B x dim\n        # padding mask:  S x B\n        \n        # convert all to batch first\n        inputs = inputs.transpose(1,0) # B, T, dim\n        encoder_outputs = encoder_outputs.transpose(1,0) # B, S, dim\n        encoder_padding_mask = encoder_padding_mask.transpose(1,0) # B, S\n        \n        # project to the dimensionality of encoder_outputs\n        x = self.input_proj(inputs)\n\n        # compute attention\n        # (B, T, dim) x (B, dim, S) = (B, T, S)\n        attn_scores = torch.bmm(x, encoder_outputs.transpose(1,2))\n\n        # cancel the attention at positions corresponding to padding\n        if encoder_padding_mask is not None:\n            # leveraging broadcast  B, S -&gt; (B, 1, S)\n            encoder_padding_mask = encoder_padding_mask.unsqueeze(1)\n            attn_scores = (\n                attn_scores.float()\n                .masked_fill_(encoder_padding_mask, float(\"-inf\"))\n                .type_as(attn_scores)\n            )  # FP16 support: cast to float and back\n\n        # softmax on the dimension corresponding to source sequence\n        attn_scores = F.softmax(attn_scores, dim=-1)\n\n        # shape (B, T, S) x (B, S, dim) = (B, T, dim) weighted sum\n        x = torch.bmm(attn_scores, encoder_outputs)\n\n        # (B, T, dim)\n        x = torch.cat((x, inputs), dim=-1)\n        x = torch.tanh(self.output_proj(x)) # concat + linear + tanh\n        \n        # restore shape (B, T, dim) -&gt; (T, B, dim)\n    
    return x.transpose(1,0), attn_scores<\/code><\/pre>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Decoder<\/strong><\/p>\n\n\n\n<p class=\"wp-block-paragraph\">* The hidden states of <strong>**Decoder**<\/strong> will be initialized by the final hidden states of <strong>**Encoder**<\/strong> (the content vector)<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">* At the same time, <strong>**Decoder**<\/strong> will change its hidden states based on the input of the current timestep (the outputs of previous timesteps), and generates an output<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">* Attention improves the performance<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">* The seq2seq steps are implemented in decoder, so that later the Seq2Seq class can accept RNN and Transformer, without further modification.<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&#8211; Parameters:<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&nbsp; &#8211; <em>*args*<\/em><\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&nbsp; &nbsp; &nbsp; &#8211; decoder_embed_dim: is the dimensionality of the decoder embeddings, similar to encoder_embed_dim<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&nbsp; &nbsp; &nbsp; &#8211; decoder_ffn_embed_dim: is the dimensionality of the decoder RNN hidden states, similar to encoder_ffn_embed_dim<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&nbsp; &nbsp; &nbsp; &#8211; decoder_layers: number of layers of RNN decoder<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&nbsp; &nbsp; &nbsp; &#8211; share_decoder_input_output_embed: usually, the projection matrix of the decoder will share weights with the decoder input embeddings<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&nbsp; &#8211; <em>*dictionary*<\/em>: the dictionary provided by fairseq<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&nbsp; &#8211; <em>*embed_tokens*<\/em>: an instance of token embeddings (nn.Embedding)<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&#8211; Inputs:<\/p>\n\n\n\n<p 
class=\"wp-block-paragraph\">&nbsp; &nbsp; &#8211; <em>*prev_output_tokens*<\/em>: integer sequence representing the right-shifted target e.g. 1, 28, 29, 205, 2<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&nbsp; &nbsp; &#8211; <em>*encoder_out*<\/em>: encoder&#8217;s output.<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&nbsp; &nbsp; &#8211; <em>*incremental_state*<\/em>: in order to speed up decoding during test time, we will save the hidden state of each timestep. see forward() for details.<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&#8211; Outputs:<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&nbsp; &nbsp; &#8211; <em>*outputs*<\/em>: the logits (before softmax) output of decoder for each timestep<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&nbsp; &nbsp; &#8211; <em>*extra*<\/em>: unused<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>class RNNDecoder(FairseqIncrementalDecoder):\n    def __init__(self, args, dictionary, embed_tokens):\n        super().__init__(dictionary)\n        self.embed_tokens = embed_tokens\n        \n        assert args.decoder_layers == args.encoder_layers, f\"\"\"seq2seq rnn requires that encoder \n        and decoder have same layers of rnn. got: {args.encoder_layers, args.decoder_layers}\"\"\"\n        assert args.decoder_ffn_embed_dim == args.encoder_ffn_embed_dim*2, f\"\"\"seq2seq-rnn requires \n        that decoder hidden to be 2*encoder hidden dim. 
got: {args.decoder_ffn_embed_dim, args.encoder_ffn_embed_dim*2}\"\"\"\n        \n        self.embed_dim = args.decoder_embed_dim\n        self.hidden_dim = args.decoder_ffn_embed_dim\n        self.num_layers = args.decoder_layers\n        \n        \n        self.dropout_in_module = nn.Dropout(args.dropout)\n        self.rnn = nn.GRU(\n            self.embed_dim, \n            self.hidden_dim, \n            self.num_layers, \n            dropout=args.dropout, \n            batch_first=False, \n            bidirectional=False\n        )\n        self.attention = AttentionLayer(\n            self.embed_dim, self.hidden_dim, self.embed_dim, bias=False\n        ) \n        # self.attention = None\n        self.dropout_out_module = nn.Dropout(args.dropout)\n        \n        if self.hidden_dim != self.embed_dim:\n            self.project_out_dim = nn.Linear(self.hidden_dim, self.embed_dim)\n        else:\n            self.project_out_dim = None\n        \n        if args.share_decoder_input_output_embed:\n            self.output_projection = nn.Linear(\n                self.embed_tokens.weight.shape&#91;1],\n                self.embed_tokens.weight.shape&#91;0],\n                bias=False,\n            )\n            self.output_projection.weight = self.embed_tokens.weight\n        else:\n            self.output_projection = nn.Linear(\n                self.output_embed_dim, len(dictionary), bias=False\n            )\n            nn.init.normal_(\n                self.output_projection.weight, mean=0, std=self.output_embed_dim ** -0.5\n            )\n        \n    def forward(self, prev_output_tokens, encoder_out, incremental_state=None, **unused):\n        # extract the outputs from encoder\n        encoder_outputs, encoder_hiddens, encoder_padding_mask = encoder_out\n        # outputs:          seq_len x batch x num_directions*hidden\n        # encoder_hiddens:  num_layers x batch x num_directions*encoder_hidden\n        # padding_mask:     seq_len x batch\n        
\n        if incremental_state is not None and len(incremental_state) &gt; 0:\n            # if the information from last timestep is retained, we can continue from there instead of starting from bos\n            prev_output_tokens = prev_output_tokens&#91;:, -1:]\n            cache_state = self.get_incremental_state(incremental_state, \"cached_state\")\n            prev_hiddens = cache_state&#91;\"prev_hiddens\"]\n        else:\n            # incremental state does not exist, either this is training time, or the first timestep of test time\n            # prepare for seq2seq: pass the encoder_hidden to the decoder hidden states\n            prev_hiddens = encoder_hiddens\n        \n        bsz, seqlen = prev_output_tokens.size()\n        \n        # embed tokens\n        x = self.embed_tokens(prev_output_tokens)\n        x = self.dropout_in_module(x)\n\n        # B x T x C -&gt; T x B x C\n        x = x.transpose(0, 1)\n                \n        # decoder-to-encoder attention\n        if self.attention is not None:\n            x, attn = self.attention(x, encoder_outputs, encoder_padding_mask)\n                        \n        # pass thru unidirectional RNN\n        x, final_hiddens = self.rnn(x, prev_hiddens)\n        # outputs = &#91;sequence len, batch size, hid dim]\n        # hidden =  &#91;num_layers * directions, batch size  , hid dim]\n        x = self.dropout_out_module(x)\n                \n        # project to embedding size (if hidden differs from embed size, and share_embedding is True, \n        # we need to do an extra projection)\n        if self.project_out_dim != None:\n            x = self.project_out_dim(x)\n        \n        # project to vocab size\n        x = self.output_projection(x)\n        \n        # T x B x C -&gt; B x T x C\n        x = x.transpose(1, 0)\n        \n        # if incremental, record the hidden states of current timestep, which will be restored in the next timestep\n        cache_state = {\n            \"prev_hiddens\": 
final_hiddens,\n        }\n        self.set_incremental_state(incremental_state, \"cached_state\", cache_state)\n        \n        return x, None\n    \n    def reorder_incremental_state(\n        self,\n        incremental_state,\n        new_order,\n    ):\n        # This is used by fairseq's beam search. How and why is not particularly important here.\n        cache_state = self.get_incremental_state(incremental_state, \"cached_state\")\n        prev_hiddens = cache_state&#91;\"prev_hiddens\"]\n        prev_hiddens = &#91;p.index_select(0, new_order) for p in prev_hiddens]\n        cache_state = {\n            \"prev_hiddens\": torch.stack(prev_hiddens),\n        }\n        self.set_incremental_state(incremental_state, \"cached_state\", cache_state)\n        return<\/code><\/pre>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Seq2Seq<\/strong><\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&#8211; Composed of <strong>**Encoder**<\/strong> and <strong>**Decoder**<\/strong><\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&#8211; Receives inputs and passes them to <strong>**Encoder**<\/strong><\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&#8211; Pass the outputs from <strong>**Encoder**<\/strong> to <strong>**Decoder**<\/strong><\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&#8211; <strong>**Decoder**<\/strong> will decode according to outputs of previous timesteps as well as <strong>**Encoder**<\/strong> outputs &nbsp;<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&#8211; Once done decoding, return the <strong>**Decoder**<\/strong> outputs<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>class Seq2Seq(FairseqEncoderDecoderModel):\n    def __init__(self, args, encoder, decoder):\n        super().__init__(encoder, decoder)\n        self.args = args\n    \n    def forward(\n        self,\n        src_tokens,\n        src_lengths,\n        prev_output_tokens,\n        return_all_hiddens: bool = True,\n    ):\n        \"\"\"\n        Run the forward pass for an encoder-decoder model.\n     
   \"\"\"\n        encoder_out = self.encoder(\n            src_tokens, src_lengths=src_lengths, return_all_hiddens=return_all_hiddens\n        )\n        logits, extra = self.decoder(\n            prev_output_tokens,\n            encoder_out=encoder_out,\n            src_lengths=src_lengths,\n            return_all_hiddens=return_all_hiddens,\n        )\n        return logits, extra<\/code><\/pre>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Model Initialization<\/strong><\/p>\n\n\n\n<pre class=\"wp-block-code\"><code># # HINT: transformer architecture\nfrom fairseq.models.transformer import (\n    TransformerEncoder, \n    TransformerDecoder,\n)\n\ndef build_model(args, task):\n    \"\"\" build a model instance based on hyperparameters \"\"\"\n    src_dict, tgt_dict = task.source_dictionary, task.target_dictionary\n\n    # token embeddings\n    encoder_embed_tokens = nn.Embedding(len(src_dict), args.encoder_embed_dim, src_dict.pad())\n    decoder_embed_tokens = nn.Embedding(len(tgt_dict), args.decoder_embed_dim, tgt_dict.pad())\n    \n    # encoder decoder\n    # HINT: TODO: switch to TransformerEncoder &amp; TransformerDecoder\n    encoder = RNNEncoder(args, src_dict, encoder_embed_tokens)\n    decoder = RNNDecoder(args, tgt_dict, decoder_embed_tokens)\n    # encoder = TransformerEncoder(args, src_dict, encoder_embed_tokens)\n    # decoder = TransformerDecoder(args, tgt_dict, decoder_embed_tokens)\n\n    # sequence to sequence model\n    model = Seq2Seq(args, encoder, decoder)\n    \n    # initialization for seq2seq model is important, requires extra handling\n    def init_params(module):\n        from fairseq.modules import MultiheadAttention\n        if isinstance(module, nn.Linear):\n            module.weight.data.normal_(mean=0.0, std=0.02)\n            if module.bias is not None:\n                module.bias.data.zero_()\n        if isinstance(module, nn.Embedding):\n            module.weight.data.normal_(mean=0.0, std=0.02)\n            if 
module.padding_idx is not None:\n                module.weight.data&#91;module.padding_idx].zero_()\n        if isinstance(module, MultiheadAttention):\n            module.q_proj.weight.data.normal_(mean=0.0, std=0.02)\n            module.k_proj.weight.data.normal_(mean=0.0, std=0.02)\n            module.v_proj.weight.data.normal_(mean=0.0, std=0.02)\n        if isinstance(module, nn.RNNBase):\n            for name, param in module.named_parameters():\n                if \"weight\" in name or \"bias\" in name:\n                    param.data.uniform_(-0.1, 0.1)\n            \n    # weight initialization\n    model.apply(init_params)\n    return model<\/code><\/pre>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Architecture Related Configuration<\/strong><\/p>\n\n\n\n<p class=\"wp-block-paragraph\">For strong baseline, please refer to the hyperparameters for <em>*transformer-base*<\/em> in Table 3 in [Attention is all you need](#vaswani2017)<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>arch_args = Namespace(\n    encoder_embed_dim=256,\n    encoder_ffn_embed_dim=512,\n    encoder_layers=1,\n    decoder_embed_dim=256,\n    decoder_ffn_embed_dim=1024,\n    decoder_layers=1,\n    share_decoder_input_output_embed=True,\n    dropout=0.3,\n)\n\n# HINT: these patches on parameters for Transformer\ndef add_transformer_args(args):\n    args.encoder_attention_heads=4\n    args.encoder_normalize_before=True\n    \n    args.decoder_attention_heads=4\n    args.decoder_normalize_before=True\n    \n    args.activation_fn=\"relu\"\n    args.max_source_positions=1024\n    args.max_target_positions=1024\n    \n    # patches on default parameters for Transformer (those not set above)\n    from fairseq.models.transformer import base_architecture\n    base_architecture(arch_args)\n\n# add_transformer_args(arch_args)<\/code><\/pre>\n\n\n\n<pre class=\"wp-block-code\"><code>if config.use_wandb:\n    wandb.config.update(vars(arch_args))<\/code><\/pre>\n\n\n\n<pre 
class=\"wp-block-code\"><code>model = build_model(arch_args, task)\nlogger.info(model)<\/code><\/pre>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Optimization<\/strong><\/p>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Loss: Label Smoothing Regularization<\/strong><\/p>\n\n\n\n<p class=\"wp-block-paragraph\">* let the model learn to generate less concentrated distribution, and prevent over-confidence<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">* sometimes the ground truth may not be the only answer. thus, when calculating loss, we reserve some probability for incorrect labels<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">* avoids overfitting<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">code [source](https:\/\/fairseq.readthedocs.io\/en\/latest\/_modules\/fairseq\/criterions\/label_smoothed_cross_entropy.html)<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>class LabelSmoothedCrossEntropyCriterion(nn.Module):\n    def __init__(self, smoothing, ignore_index=None, reduce=True):\n        super().__init__()\n        self.smoothing = smoothing\n        self.ignore_index = ignore_index\n        self.reduce = reduce\n    \n    def forward(self, lprobs, target):\n        if target.dim() == lprobs.dim() - 1:\n            target = target.unsqueeze(-1)\n        # nll: Negative log likelihood\uff0cthe cross-entropy when target is one-hot. following line is same as F.nll_loss\n        nll_loss = -lprobs.gather(dim=-1, index=target)\n        #  reserve some probability for other labels. 
thus when calculating cross-entropy, \n        # equivalent to summing the log probs of all labels\n        smooth_loss = -lprobs.sum(dim=-1, keepdim=True)\n        if self.ignore_index is not None:\n            pad_mask = target.eq(self.ignore_index)\n            nll_loss.masked_fill_(pad_mask, 0.0)\n            smooth_loss.masked_fill_(pad_mask, 0.0)\n        else:\n            nll_loss = nll_loss.squeeze(-1)\n            smooth_loss = smooth_loss.squeeze(-1)\n        if self.reduce:\n            nll_loss = nll_loss.sum()\n            smooth_loss = smooth_loss.sum()\n        # when calculating cross-entropy, add the loss of other labels\n        eps_i = self.smoothing \/ lprobs.size(-1)\n        loss = (1.0 - self.smoothing) * nll_loss + eps_i * smooth_loss\n        return loss\n\n# generally, 0.1 is good enough\ncriterion = LabelSmoothedCrossEntropyCriterion(\n    smoothing=0.1,\n    ignore_index=task.target_dictionary.pad(),\n)<\/code><\/pre>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Optimizer: Adam + lr scheduling<\/strong><\/p>\n\n\n\n<p class=\"wp-block-paragraph\">Inverse square root scheduling is important to the stability when training Transformer. It&#8217;s later used on RNN as well.<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">Update the learning rate according to the following equation. 
Linearly increase the first stage, then decay proportionally to the inverse square root of timestep.<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">$$lrate = d_{\\text{model}}^{-0.5}\\cdot\\min({step\\_num}^{-0.5},{step\\_num}\\cdot{warmup\\_steps}^{-1.5})$$<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>def get_rate(d_model, step_num, warmup_step):\n    # TODO: Change lr from constant to the equation shown above\n    lr = 0.001\n    return lr<\/code><\/pre>\n\n\n\n<pre class=\"wp-block-code\"><code>class NoamOpt:\n    \"Optim wrapper that implements rate.\"\n    def __init__(self, model_size, factor, warmup, optimizer):\n        self.optimizer = optimizer\n        self._step = 0\n        self.warmup = warmup\n        self.factor = factor\n        self.model_size = model_size\n        self._rate = 0\n    \n    @property\n    def param_groups(self):\n        return self.optimizer.param_groups\n        \n    def multiply_grads(self, c):\n        \"\"\"Multiplies grads by a constant *c*.\"\"\"                \n        for group in self.param_groups:\n            for p in group&#91;'params']:\n                if p.grad is not None:\n                    p.grad.data.mul_(c)\n        \n    def step(self):\n        \"Update parameters and rate\"\n        self._step += 1\n        rate = self.rate()\n        for p in self.param_groups:\n            p&#91;'lr'] = rate\n        self._rate = rate\n        self.optimizer.step()\n        \n    def rate(self, step = None):\n        \"Implement `lrate` above\"\n        if step is None:\n            step = self._step\n        return 0 if not step else self.factor * get_rate(self.model_size, step, self.warmup)<\/code><\/pre>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Scheduling Visualized<\/strong><\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>optimizer = NoamOpt(\n    model_size=arch_args.encoder_embed_dim, \n    factor=config.lr_factor, \n    warmup=config.lr_warmup, \n    optimizer=torch.optim.AdamW(model.parameters(), lr=0, 
betas=(0.9, 0.98), eps=1e-9, weight_decay=0.0001))\nplt.plot(np.arange(1, 100000), &#91;optimizer.rate(i) for i in range(1, 100000)])\nplt.legend(&#91;f\"{optimizer.model_size}:{optimizer.warmup}\"])\nNone<\/code><\/pre>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Training Procedure<\/strong><\/p>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Training<\/strong><\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>from fairseq.data import iterators\nfrom torch.cuda.amp import GradScaler, autocast\n\ndef train_one_epoch(epoch_itr, model, task, criterion, optimizer, accum_steps=1):\n    itr = epoch_itr.next_epoch_itr(shuffle=True)\n    itr = iterators.GroupedIterator(itr, accum_steps) # gradient accumulation: update every accum_steps samples\n    \n    stats = {\"loss\": &#91;]}\n    scaler = GradScaler() # automatic mixed precision (amp) \n    \n    model.train()\n    progress = tqdm.tqdm(itr, desc=f\"train epoch {epoch_itr.epoch}\", leave=False)\n    for samples in progress:\n        model.zero_grad()\n        accum_loss = 0\n        sample_size = 0\n        # gradient accumulation: update every accum_steps samples\n        for i, sample in enumerate(samples):\n            if i == 1:\n                # emptying the CUDA cache after the first step can reduce the chance of OOM\n                torch.cuda.empty_cache()\n\n            sample = utils.move_to_cuda(sample, device=device)\n            target = sample&#91;\"target\"]\n            sample_size_i = sample&#91;\"ntokens\"]\n            sample_size += sample_size_i\n            \n            # mixed precision training\n            with autocast():\n                net_output = model.forward(**sample&#91;\"net_input\"])\n                lprobs = F.log_softmax(net_output&#91;0], -1)            \n                loss = criterion(lprobs.view(-1, lprobs.size(-1)), target.view(-1))\n                \n                # logging\n                accum_loss += loss.item()\n                # back-prop\n                
scaler.scale(loss).backward()                \n        \n        scaler.unscale_(optimizer)\n        optimizer.multiply_grads(1 \/ (sample_size or 1.0)) # (sample_size or 1.0) handles the case of a zero gradient\n        gnorm = nn.utils.clip_grad_norm_(model.parameters(), config.clip_norm) # grad norm clipping prevents gradient exploding\n        \n        scaler.step(optimizer)\n        scaler.update()\n        \n        # logging\n        loss_print = accum_loss\/sample_size\n        stats&#91;\"loss\"].append(loss_print)\n        progress.set_postfix(loss=loss_print)\n        if config.use_wandb:\n            wandb.log({\n                \"train\/loss\": loss_print,\n                \"train\/grad_norm\": gnorm.item(),\n                \"train\/lr\": optimizer.rate(),\n                \"train\/sample_size\": sample_size,\n            })\n        \n    loss_print = np.mean(stats&#91;\"loss\"])\n    logger.info(f\"training loss: {loss_print:.4f}\")\n    return stats<\/code><\/pre>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Validation &amp; Inference<\/strong><\/p>\n\n\n\n<p class=\"wp-block-paragraph\">To prevent overfitting, validation is required every epoch to validate the performance on unseen data.<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&#8211; the procedure is essentially the same as training, with the addition of an inference step<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&#8211; after validation we can save the model weights<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">Validation loss alone cannot describe the actual performance of the model<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&#8211; Directly produce translation hypotheses based on current model, then calculate BLEU with the reference translation<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&#8211; We can also manually examine the hypotheses&#8217; quality<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">&#8211; We use fairseq&#8217;s sequence generator for beam search to generate translation 
hypotheses<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code># fairseq's beam search generator\n# given model and input seqeunce, produce translation hypotheses by beam search\nsequence_generator = task.build_generator(&#91;model], config)\n\ndef decode(toks, dictionary):\n    # convert from Tensor to human readable sentence\n    s = dictionary.string(\n        toks.int().cpu(),\n        config.post_process,\n    )\n    return s if s else \"&lt;unk&gt;\"\n\ndef inference_step(sample, model):\n    gen_out = sequence_generator.generate(&#91;model], sample)\n    srcs = &#91;]\n    hyps = &#91;]\n    refs = &#91;]\n    for i in range(len(gen_out)):\n        # for each sample, collect the input, hypothesis and reference, later be used to calculate BLEU\n        srcs.append(decode(\n            utils.strip_pad(sample&#91;\"net_input\"]&#91;\"src_tokens\"]&#91;i], task.source_dictionary.pad()), \n            task.source_dictionary,\n        ))\n        hyps.append(decode(\n            gen_out&#91;i]&#91;0]&#91;\"tokens\"], # 0 indicates using the top hypothesis in beam\n            task.target_dictionary,\n        ))\n        refs.append(decode(\n            utils.strip_pad(sample&#91;\"target\"]&#91;i], task.target_dictionary.pad()), \n            task.target_dictionary,\n        ))\n    return srcs, hyps, refs<\/code><\/pre>\n\n\n\n<pre class=\"wp-block-code\"><code>import shutil\nimport sacrebleu\n\ndef validate(model, task, criterion, log_to_wandb=True):\n    logger.info('begin validation')\n    itr = load_data_iterator(task, \"valid\", 1, config.max_tokens, config.num_workers).next_epoch_itr(shuffle=False)\n    \n    stats = {\"loss\":&#91;], \"bleu\": 0, \"srcs\":&#91;], \"hyps\":&#91;], \"refs\":&#91;]}\n    srcs = &#91;]\n    hyps = &#91;]\n    refs = &#91;]\n    \n    model.eval()\n    progress = tqdm.tqdm(itr, desc=f\"validation\", leave=False)\n    with torch.no_grad():\n        for i, sample in enumerate(progress):\n            # validation loss\n            
sample = utils.move_to_cuda(sample, device=device)\n            net_output = model.forward(**sample&#91;\"net_input\"])\n\n            lprobs = F.log_softmax(net_output&#91;0], -1)\n            target = sample&#91;\"target\"]\n            sample_size = sample&#91;\"ntokens\"]\n            loss = criterion(lprobs.view(-1, lprobs.size(-1)), target.view(-1)) \/ sample_size\n            progress.set_postfix(valid_loss=loss.item())\n            stats&#91;\"loss\"].append(loss)\n            \n            # do inference\n            s, h, r = inference_step(sample, model)\n            srcs.extend(s)\n            hyps.extend(h)\n            refs.extend(r)\n            \n    tok = 'zh' if task.cfg.target_lang == 'zh' else '13a'\n    stats&#91;\"loss\"] = torch.stack(stats&#91;\"loss\"]).mean().item()\n    stats&#91;\"bleu\"] = sacrebleu.corpus_bleu(hyps, &#91;refs], tokenize=tok) # \u8a08\u7b97BLEU score\n    stats&#91;\"srcs\"] = srcs\n    stats&#91;\"hyps\"] = hyps\n    stats&#91;\"refs\"] = refs\n    \n    if config.use_wandb and log_to_wandb:\n        wandb.log({\n            \"valid\/loss\": stats&#91;\"loss\"],\n            \"valid\/bleu\": stats&#91;\"bleu\"].score,\n        }, commit=False)\n    \n    showid = np.random.randint(len(hyps))\n    logger.info(\"example source: \" + srcs&#91;showid])\n    logger.info(\"example hypothesis: \" + hyps&#91;showid])\n    logger.info(\"example reference: \" + refs&#91;showid])\n    \n    # show bleu results\n    logger.info(f\"validation loss:\\t{stats&#91;'loss']:.4f}\")\n    logger.info(stats&#91;\"bleu\"].format())\n    return stats<\/code><\/pre>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Save and Load Model Weights<\/strong><\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>def validate_and_save(model, task, criterion, optimizer, epoch, save=True):   \n    stats = validate(model, task, criterion)\n    bleu = stats&#91;'bleu']\n    loss = stats&#91;'loss']\n    if save:\n        # save epoch checkpoints\n        savedir = 
Path(config.savedir).absolute()\n        savedir.mkdir(parents=True, exist_ok=True)\n        \n        check = {\n            \"model\": model.state_dict(),\n            \"stats\": {\"bleu\": bleu.score, \"loss\": loss},\n            \"optim\": {\"step\": optimizer._step}\n        }\n        torch.save(check, savedir\/f\"checkpoint{epoch}.pt\")\n        shutil.copy(savedir\/f\"checkpoint{epoch}.pt\", savedir\/f\"checkpoint_last.pt\")\n        logger.info(f\"saved epoch checkpoint: {savedir}\/checkpoint{epoch}.pt\")\n    \n        # save epoch samples\n        with open(savedir\/f\"samples{epoch}.{config.source_lang}-{config.target_lang}.txt\", \"w\") as f:\n            for s, h in zip(stats&#91;\"srcs\"], stats&#91;\"hyps\"]):\n                f.write(f\"{s}\\t{h}\\n\")\n\n        # get best valid bleu    \n        if getattr(validate_and_save, \"best_bleu\", 0) &lt; bleu.score:\n            validate_and_save.best_bleu = bleu.score\n            torch.save(check, savedir\/f\"checkpoint_best.pt\")\n            \n        del_file = savedir \/ f\"checkpoint{epoch - config.keep_last_epochs}.pt\"\n        if del_file.exists():\n            del_file.unlink()\n    \n    return stats\n\ndef try_load_checkpoint(model, optimizer=None, name=None):\n    name = name if name else \"checkpoint_last.pt\"\n    checkpath = Path(config.savedir)\/name\n    if checkpath.exists():\n        check = torch.load(checkpath)\n        model.load_state_dict(check&#91;\"model\"])\n        stats = check&#91;\"stats\"]\n        step = \"unknown\"\n        if optimizer != None:\n            optimizer._step = step = check&#91;\"optim\"]&#91;\"step\"]\n        logger.info(f\"loaded checkpoint {checkpath}: step={step} loss={stats&#91;'loss']} bleu={stats&#91;'bleu']}\")\n    else:\n        logger.info(f\"no checkpoints found at {checkpath}!\")<\/code><\/pre>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Main<\/strong><\/p>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Training 
loop<\/strong><\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>model = model.to(device=device)\ncriterion = criterion.to(device=device)<\/code><\/pre>\n\n\n\n<pre class=\"wp-block-code\"><code>logger.info(\"task: {}\".format(task.__class__.__name__))\nlogger.info(\"encoder: {}\".format(model.encoder.__class__.__name__))\nlogger.info(\"decoder: {}\".format(model.decoder.__class__.__name__))\nlogger.info(\"criterion: {}\".format(criterion.__class__.__name__))\nlogger.info(\"optimizer: {}\".format(optimizer.__class__.__name__))\nlogger.info(\n    \"num. model params: {:,} (num. trained: {:,})\".format(\n        sum(p.numel() for p in model.parameters()),\n        sum(p.numel() for p in model.parameters() if p.requires_grad),\n    )\n)\nlogger.info(f\"max tokens per batch = {config.max_tokens}, accumulate steps = {config.accum_steps}\")<\/code><\/pre>\n\n\n\n<pre class=\"wp-block-code\"><code>epoch_itr = load_data_iterator(task, \"train\", config.start_epoch, config.max_tokens, config.num_workers)\ntry_load_checkpoint(model, optimizer, name=config.resume)\nwhile epoch_itr.next_epoch_idx &lt;= config.max_epoch:\n    # train for one epoch\n    train_one_epoch(epoch_itr, model, task, criterion, optimizer, config.accum_steps)\n    stats = validate_and_save(model, task, criterion, optimizer, epoch=epoch_itr.epoch)\n    logger.info(\"end of epoch {}\".format(epoch_itr.epoch))    \n    epoch_itr = load_data_iterator(task, \"train\", epoch_itr.next_epoch_idx, config.max_tokens, config.num_workers)<\/code><\/pre>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Submission<\/strong><\/p>\n\n\n\n<pre class=\"wp-block-code\"><code># averaging a few checkpoints can have a similar effect to ensemble\ncheckdir=config.savedir\n!python .\/fairseq\/scripts\/average_checkpoints.py \\\n--inputs {checkdir} \\\n--num-epoch-checkpoints 5 \\\n--output {checkdir}\/avg_last_5_checkpoint.pt<\/code><\/pre>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Confirm model weights used to generate 
submission<\/strong><\/p>\n\n\n\n<pre class=\"wp-block-code\"><code># checkpoint_last.pt : latest epoch\n# checkpoint_best.pt : highest validation bleu\n# avg_last_5_checkpoint.pt: the average of last 5 epochs\ntry_load_checkpoint(model, name=\"avg_last_5_checkpoint.pt\")\nvalidate(model, task, criterion, log_to_wandb=False)\nNone<\/code><\/pre>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Generate Prediction<\/strong><\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>def generate_prediction(model, task, split=\"test\", outfile=\".\/prediction.txt\"):    \n    task.load_dataset(split=split, epoch=1)\n    itr = load_data_iterator(task, split, 1, config.max_tokens, config.num_workers).next_epoch_itr(shuffle=False)\n    \n    idxs = &#91;]\n    hyps = &#91;]\n\n    model.eval()\n    progress = tqdm.tqdm(itr, desc=f\"prediction\")\n    with torch.no_grad():\n        for i, sample in enumerate(progress):\n            # validation loss\n            sample = utils.move_to_cuda(sample, device=device)\n\n            # do inference\n            s, h, r = inference_step(sample, model)\n            \n            hyps.extend(h)\n            idxs.extend(list(sample&#91;'id']))\n            \n    # sort based on the order before preprocess\n    hyps = &#91;x for _,x in sorted(zip(idxs,hyps))]\n    \n    with open(outfile, \"w\") as f:\n        for h in hyps:\n            f.write(h+\"\\n\")<\/code><\/pre>\n\n\n\n<pre class=\"wp-block-code\"><code>generate_prediction(model, task)<\/code><\/pre>\n\n\n\n<pre class=\"wp-block-code\"><code>raise<\/code><\/pre>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Back-translation<\/strong><\/p>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Train a backward translation model<\/strong><\/p>\n\n\n\n<p class=\"wp-block-paragraph\">1. Switch the source_lang and target_lang in <strong>**config**<\/strong><\/p>\n\n\n\n<p class=\"wp-block-paragraph\">2. Change the savedir in <strong>**config**<\/strong> (eg. 
&#8220;.\/checkpoints\/transformer-back&#8221;)<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">3. Train model<\/p>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Generate synthetic data with backward model<\/strong><\/p>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Download monolingual data<\/strong><\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>mono_dataset_name = 'mono'<\/code><\/pre>\n\n\n\n<pre class=\"wp-block-code\"><code>mono_prefix = Path(data_dir).absolute() \/ mono_dataset_name\nmono_prefix.mkdir(parents=True, exist_ok=True)\n\nurls = (\n    \"https:\/\/github.com\/figisiwirf\/ml2023-hw5-dataset\/releases\/download\/v1.0.1\/ted_zh_corpus.deduped.gz\",\n)\nfile_names = (\n    'ted_zh_corpus.deduped.gz',\n)\n\nfor u, f in zip(urls, file_names):\n    path = mono_prefix\/f\n    if not path.exists():\n        !wget {u} -O {path}\n    else:\n        print(f'{f} is exist, skip downloading')\n    if path.suffix == \".tgz\":\n        !tar -xvf {path} -C {prefix}\n    elif path.suffix == \".zip\":\n        !unzip -o {path} -d {prefix}\n    elif path.suffix == \".gz\":\n        !gzip -fkd {path}<\/code><\/pre>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>TODO: clean corpus<\/strong><\/p>\n\n\n\n<p class=\"wp-block-paragraph\">1. remove sentences that are too long or too short<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">2. 
unify punctuation<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">hint: you can use clean_s() defined above to do this<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">++++++<\/p>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>TODO: Subword Units<\/strong><\/p>\n\n\n\n<p class=\"wp-block-paragraph\">Use the spm model of the backward model to tokenize the data into subword units<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">hint: spm model is located at DATA\/raw-data\/\\[dataset\\]\/spm\\[vocab_num\\].model<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">++++++<\/p>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>Binarize<\/strong><\/p>\n\n\n\n<p class=\"wp-block-paragraph\">use fairseq to binarize data<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>binpath = Path('.\/DATA\/data-bin', mono_dataset_name)\nsrc_dict_file = '.\/DATA\/data-bin\/ted2020\/dict.en.txt'\ntgt_dict_file = src_dict_file\nmonopref = str(mono_prefix\/\"mono.tok\") # whatever filepath you get after applying subword tokenization\nif binpath.exists():\n    print(binpath, \"exists, will not overwrite!\")\nelse:\n    !python -m fairseq_cli.preprocess\\\n        --source-lang 'zh'\\\n        --target-lang 'en'\\\n        --trainpref {monopref}\\\n        --destdir {binpath}\\\n        --srcdict {src_dict_file}\\\n        --tgtdict {tgt_dict_file}\\\n        --workers 2<\/code><\/pre>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>TODO: Generate synthetic data with backward model<\/strong><\/p>\n\n\n\n<p class=\"wp-block-paragraph\">Add binarized monolingual data to the original data directory, and name it with &#8220;split_name&#8221;<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">ex. 
.\/DATA\/data-bin\/ted2020\/[split_name].zh-en.[&#8220;en&#8221;, &#8220;zh&#8221;].[&#8220;bin&#8221;, &#8220;idx&#8221;]<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">then you can use &#8216;generate_prediction(model, task, split=\"split_name\")&#8217; to generate translation prediction<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code># Add binarized monolingual data to the original data directory, and name it with \"split_name\"\n# ex. .\/DATA\/data-bin\/ted2020\/\\&#91;split_name\\].zh-en.\\&#91;\"en\", \"zh\"\\].\\&#91;\"bin\", \"idx\"\\]\n!cp .\/DATA\/data-bin\/mono\/train.zh-en.zh.bin .\/DATA\/data-bin\/ted2020\/mono.zh-en.zh.bin\n!cp .\/DATA\/data-bin\/mono\/train.zh-en.zh.idx .\/DATA\/data-bin\/ted2020\/mono.zh-en.zh.idx\n!cp .\/DATA\/data-bin\/mono\/train.zh-en.en.bin .\/DATA\/data-bin\/ted2020\/mono.zh-en.en.bin\n!cp .\/DATA\/data-bin\/mono\/train.zh-en.en.idx .\/DATA\/data-bin\/ted2020\/mono.zh-en.en.idx<\/code><\/pre>\n\n\n\n<pre class=\"wp-block-code\"><code># hint: do prediction on split='mono' to create prediction_file\n# generate_prediction( ... ,split=... ,outfile=... )<\/code><\/pre>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>TODO: Create new dataset<\/strong><\/p>\n\n\n\n<p class=\"wp-block-paragraph\">1. Combine the prediction data with monolingual data<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">2. Use the original spm model to tokenize data into Subword Units<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">3. 
Binarize data with fairseq<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code># Combine prediction_file (.en) and mono.zh (.zh) into a new dataset.\n# \n# hint: tokenize prediction_file with the spm model\n# spm_model.encode(line, out_type=str)\n# output: .\/DATA\/rawdata\/mono\/mono.tok.en &amp; mono.tok.zh\n#\n# hint: use fairseq to binarize these two files again\n# binpath = Path('.\/DATA\/data-bin\/synthetic')\n# src_dict_file = '.\/DATA\/data-bin\/ted2020\/dict.en.txt'\n# tgt_dict_file = src_dict_file\n# monopref = .\/DATA\/rawdata\/mono\/mono.tok # or whatever path after applying subword tokenization, w\/o the suffix (.zh\/.en)\n# if binpath.exists():\n#     print(binpath, \"exists, will not overwrite!\")\n# else:\n#     !python -m fairseq_cli.preprocess\\\n#         --source-lang 'zh'\\\n#         --target-lang 'en'\\\n#         --trainpref {monopref}\\\n#         --destdir {binpath}\\\n#         --srcdict {src_dict_file}\\\n#         --tgtdict {tgt_dict_file}\\\n#         --workers 2<\/code><\/pre>\n\n\n\n<pre class=\"wp-block-code\"><code># create a new dataset from all the files prepared above\n!cp -r .\/DATA\/data-bin\/ted2020\/ .\/DATA\/data-bin\/ted2020_with_mono\/\n\n!cp .\/DATA\/data-bin\/synthetic\/train.zh-en.zh.bin .\/DATA\/data-bin\/ted2020_with_mono\/train1.en-zh.zh.bin\n!cp .\/DATA\/data-bin\/synthetic\/train.zh-en.zh.idx .\/DATA\/data-bin\/ted2020_with_mono\/train1.en-zh.zh.idx\n!cp .\/DATA\/data-bin\/synthetic\/train.zh-en.en.bin .\/DATA\/data-bin\/ted2020_with_mono\/train1.en-zh.en.bin\n!cp .\/DATA\/data-bin\/synthetic\/train.zh-en.en.idx .\/DATA\/data-bin\/ted2020_with_mono\/train1.en-zh.en.idx<\/code><\/pre>\n\n\n\n<p class=\"wp-block-paragraph\">Created new dataset &#8220;ted2020_with_mono&#8221;<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">1. Change the datadir in <strong>config<\/strong> (&#8220;.\/DATA\/data-bin\/ted2020_with_mono&#8221;)<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">2. 
Switch back the source_lang and target_lang in <strong>config<\/strong> (&#8220;en&#8221;, &#8220;zh&#8221;)<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">3. Change the savedir in <strong>config<\/strong> (e.g. &#8220;.\/checkpoints\/transformer-bt&#8221;)<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">4. Train model<\/p>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>References<\/strong><\/p>\n\n\n\n<p class=\"wp-block-paragraph\">1. <a id=\"ott2019fairseq\"><\/a>Ott, M., Edunov, S., Baevski, A., Fan, A., Gross, S., Ng, N., &#8230; &amp; Auli, M. (2019, June). fairseq: A Fast, Extensible Toolkit for Sequence Modeling. In Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics (Demonstrations) (pp. 48-53).<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">2. <a id=\"vaswani2017\"><\/a>Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A. N., &#8230; &amp; Polosukhin, I. (2017, December). Attention is all you need. In Proceedings of the 31st International Conference on Neural Information Processing Systems (pp. 6000-6010).<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">3. <a id=\"reimers-2020-multilingual-sentence-bert\"><\/a>Reimers, N., &amp; Gurevych, I. (2020, November). Making Monolingual Sentence Embeddings Multilingual Using Knowledge Distillation. In Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP) (pp. 4512-4525).<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">4. <a id=\"tiedemann2012parallel\"><\/a>Tiedemann, J. (2012, May). Parallel Data, Tools and Interfaces in OPUS. In LREC (Vol. 2012, pp. 2214-2218).<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">5. <a id=\"kudo-richardson-2018-sentencepiece\"><\/a>Kudo, T., &amp; Richardson, J. (2018, November). SentencePiece: A simple and language independent subword tokenizer and detokenizer for Neural Text Processing. 
In Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing: System Demonstrations (pp. 66-71).<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">6. <a id=\"sennrich-etal-2016-improving\"><\/a>Sennrich, R., Haddow, B., &amp; Birch, A. (2016, August). Improving Neural Machine Translation Models with Monolingual Data. In Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers) (pp. 86-96).<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">7. <a id=\"edunov-etal-2018-understanding\"><\/a>Edunov, S., Ott, M., Auli, M., &amp; Grangier, D. (2018). Understanding Back-Translation at Scale. In Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing (pp. 489-500).<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">8. https:\/\/github.com\/ajinkyakulkarni14\/TED-Multilingual-Parallel-Corpus<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">9. https:\/\/ithelp.ithome.com.tw\/articles\/10233122<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">10. https:\/\/nlp.seas.harvard.edu\/2018\/04\/03\/attention.html<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">11. 
https:\/\/colab.research.google.com\/github\/ga642381\/ML2021-Spring\/blob\/main\/HW05\/HW05.ipynb<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">++++++<\/p>\n","protected":false},"excerpt":{"rendered":"<p>\u3010HW5\u3011Transformer0.0\u674e\u5b8f\u6bc52021\/2022\u6625\u673a\u5668\u5b66\u4e60\u8bfe\u7a0b\u7b14\u8bb0EP15(P54-P57) \u4ece [&hellip;]<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[6],"tags":[15,3,7,9,8],"class_list":["post-551","post","type-post","status-publish","format-standard","hentry","category-lhyjqxxbj","tag-homework","tag-xxbj","tag-jjxx","tag-lhy","tag-deeplearning"],"_links":{"self":[{"href":"https:\/\/tobykskgd.life\/index.php\/wp-json\/wp\/v2\/posts\/551","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/tobykskgd.life\/index.php\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/tobykskgd.life\/index.php\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/tobykskgd.life\/index.php\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/tobykskgd.life\/index.php\/wp-json\/wp\/v2\/comments?post=551"}],"version-history":[{"count":2,"href":"https:\/\/tobykskgd.life\/index.php\/wp-json\/wp\/v2\/posts\/551\/revisions"}],"predecessor-version":[{"id":1878,"href":"https:\/\/tobykskgd.life\/index.php\/wp-json\/wp\/v2\/posts\/551\/revisions\/1878"}],"wp:attachment":[{"href":"https:\/\/tobykskgd.life\/index.php\/wp-json\/wp\/v2\/media?parent=551"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/tobykskgd.life\/index.php\/wp-json\/wp\/v2\/categories?post=551"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/tobykskgd.life\/index.php\/wp-json\/wp\/v2\/tags?post=551"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}