| nohup: ignoring input | |
| sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : | |
| trainer_spec { | |
| input: /home/jknafou/Corpus_translation/translated_corpus/cat_title_abstract_10M.txt | |
| input_format: | |
| model_prefix: /home/jknafou/Language_model_training/French/Tokenizer/bio_french | |
| model_type: UNIGRAM | |
| vocab_size: 32000 | |
| self_test_sample_size: 0 | |
| character_coverage: 0.9995 | |
| input_sentence_size: 0 | |
| shuffle_input_sentence: 1 | |
| seed_sentencepiece_size: 1000000 | |
| shrinking_factor: 0.75 | |
| max_sentence_length: 4192 | |
| num_threads: 16 | |
| num_sub_iterations: 2 | |
| max_sentencepiece_length: 16 | |
| split_by_unicode_script: 1 | |
| split_by_number: 1 | |
| split_by_whitespace: 1 | |
| split_digits: 0 | |
| treat_whitespace_as_suffix: 0 | |
| required_chars: | |
| byte_fallback: 0 | |
| vocabulary_output_piece_score: 1 | |
| train_extremely_large_corpus: 1 | |
| hard_vocab_limit: 1 | |
| use_all_vocab: 0 | |
| unk_id: 0 | |
| bos_id: 1 | |
| eos_id: 2 | |
| pad_id: -1 | |
| unk_piece: <unk> | |
| bos_piece: <s> | |
| eos_piece: </s> | |
| pad_piece: <pad> | |
| unk_surface: ⁇ | |
| } | |
| normalizer_spec { | |
| name: nmt_nfkc | |
| add_dummy_prefix: 1 | |
| remove_extra_whitespaces: 1 | |
| escape_whitespaces: 1 | |
| normalization_rule_tsv: | |
| } | |
| denormalizer_spec {} | |
| trainer_interface.cc(319) LOG(INFO) SentenceIterator is not specified. Using MultiFileSentenceIterator. | |
| trainer_interface.cc(174) LOG(INFO) Loading corpus: /home/jknafou/Corpus_translation/translated_corpus/cat_title_abstract_10M.txt | |
| trainer_interface.cc(346) LOG(WARNING) Found too long line (7312 > 4192). | |
| trainer_interface.cc(348) LOG(WARNING) Too long lines are skipped in the training. | |
| trainer_interface.cc(349) LOG(WARNING) The maximum length can be changed with --max_sentence_length=<size> flag. | |
| trainer_interface.cc(136) LOG(INFO) Loaded 1000000 lines | |
| trainer_interface.cc(136) LOG(INFO) Loaded 2000000 lines | |
| trainer_interface.cc(136) LOG(INFO) Loaded 3000000 lines | |
| trainer_interface.cc(136) LOG(INFO) Loaded 4000000 lines | |
| trainer_interface.cc(136) LOG(INFO) Loaded 5000000 lines | |
| trainer_interface.cc(136) LOG(INFO) Loaded 6000000 lines | |
| trainer_interface.cc(136) LOG(INFO) Loaded 7000000 lines | |
| trainer_interface.cc(136) LOG(INFO) Loaded 8000000 lines | |
| trainer_interface.cc(136) LOG(INFO) Loaded 9000000 lines | |
| trainer_interface.cc(113) LOG(WARNING) Too many sentences are loaded! (9978301), which may slow down training. | |
| trainer_interface.cc(115) LOG(WARNING) Consider using --input_sentence_size=<size> and --shuffle_input_sentence=true. | |
| trainer_interface.cc(118) LOG(WARNING) They allow to randomly sample <size> sentences from the entire corpus. | |
| trainer_interface.cc(375) LOG(INFO) Loaded all 9978301 sentences | |
| trainer_interface.cc(381) LOG(INFO) Skipped 21699 too long sentences. | |
| trainer_interface.cc(390) LOG(INFO) Adding meta_piece: <unk> | |
| trainer_interface.cc(390) LOG(INFO) Adding meta_piece: <s> | |
| trainer_interface.cc(390) LOG(INFO) Adding meta_piece: </s> | |
| trainer_interface.cc(395) LOG(INFO) Normalizing sentences... | |
| trainer_interface.cc(456) LOG(INFO) all chars count=16138226709 | |
| trainer_interface.cc(467) LOG(INFO) Done: 99.9527% characters are covered. | |
| trainer_interface.cc(477) LOG(INFO) Alphabet size=95 | |
| trainer_interface.cc(478) LOG(INFO) Final character coverage=0.999527 | |
| trainer_interface.cc(510) LOG(INFO) Done! preprocessed 9978301 sentences. | |
| tcmalloc: large alloc 1073741824 bytes == 0x561532c8c000 @ | |
| tcmalloc: large alloc 2147483648 bytes == 0x561572c8c000 @ | |
| tcmalloc: large alloc 4294967296 bytes == 0x5615f3508000 @ | |
| tcmalloc: large alloc 8589934592 bytes == 0x5616f3508000 @ | |
| tcmalloc: large alloc 17179869184 bytes == 0x5618f3d08000 @ | |
| tcmalloc: large alloc 34359738368 bytes == 0x561cf4d08000 @ | |
| tcmalloc: large alloc 68719476736 bytes == 0x5624f6d08000 @ | |
| tcmalloc: large alloc 129105821696 bytes == 0x5634fad08000 @ | |
| tcmalloc: large alloc 129105821696 bytes == 0x5653119f0000 @ | |
| tcmalloc: large alloc 129105821696 bytes == 0x5671286d8000 @ | |
| tcmalloc: large alloc 129105821696 bytes == 0x568f3f3c0000 @ | |
| unigram_model_trainer.cc(138) LOG(INFO) Making suffix array... | |
| unigram_model_trainer.cc(142) LOG(INFO) Extracting frequent sub strings... | |
| unigram_model_trainer.cc(193) LOG(INFO) Initialized 1000000 seed sentencepieces | |
| trainer_interface.cc(516) LOG(INFO) Tokenizing input sentences with whitespace: 9978301 | |
| trainer_interface.cc(526) LOG(INFO) Done! 19224817 | |
| unigram_model_trainer.cc(488) LOG(INFO) Using 19224817 sentences for EM training | |
| unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=0 size=739481 obj=12.0696 num_tokens=74362684 num_tokens/piece=100.561 | |
| unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=1 size=599525 obj=9.09794 num_tokens=74303077 num_tokens/piece=123.937 | |
| unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=0 size=449629 obj=9.06759 num_tokens=74685579 num_tokens/piece=166.105 | |
| unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=1 size=449389 obj=9.06467 num_tokens=74741532 num_tokens/piece=166.318 | |
| unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=0 size=337040 obj=9.05777 num_tokens=75208218 num_tokens/piece=223.143 | |
| unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=1 size=337037 obj=9.06272 num_tokens=75221866 num_tokens/piece=223.186 | |
| unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=0 size=252777 obj=9.05783 num_tokens=76327358 num_tokens/piece=301.955 | |
| unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=1 size=252776 obj=9.06287 num_tokens=76319490 num_tokens/piece=301.925 | |
| unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=0 size=189582 obj=9.07482 num_tokens=78333478 num_tokens/piece=413.19 | |
| unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=1 size=189582 obj=9.0738 num_tokens=78313493 num_tokens/piece=413.085 | |
| unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=0 size=142186 obj=9.10085 num_tokens=80614240 num_tokens/piece=566.963 | |
| unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=1 size=142186 obj=9.09887 num_tokens=80596413 num_tokens/piece=566.838 | |
| unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=0 size=106639 obj=9.13963 num_tokens=83102337 num_tokens/piece=779.287 | |
| unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=1 size=106639 obj=9.12951 num_tokens=83086218 num_tokens/piece=779.135 | |
| unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=0 size=79979 obj=9.19014 num_tokens=85866144 num_tokens/piece=1073.61 | |
| unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=1 size=79979 obj=9.18657 num_tokens=85860214 num_tokens/piece=1073.53 | |
| unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=0 size=59984 obj=9.25486 num_tokens=88825440 num_tokens/piece=1480.82 | |
| unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=1 size=59984 obj=9.23964 num_tokens=88813706 num_tokens/piece=1480.62 | |
| unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=0 size=44988 obj=9.33847 num_tokens=92202568 num_tokens/piece=2049.49 | |
| unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=1 size=44988 obj=9.32441 num_tokens=92203259 num_tokens/piece=2049.51 | |
| unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=0 size=35200 obj=9.4222 num_tokens=95280571 num_tokens/piece=2706.83 | |
| unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=1 size=35200 obj=9.40217 num_tokens=95275736 num_tokens/piece=2706.7 | |
| trainer_interface.cc(604) LOG(INFO) Saving model: /home/jknafou/Language_model_training/French/Tokenizer/bio_french.model | |
| trainer_interface.cc(615) LOG(INFO) Saving vocabs: /home/jknafou/Language_model_training/French/Tokenizer/bio_french.vocab | |