TransBERT-bio-fr / french_log_final.out
Julien Knafou
nohup: ignoring input
sentencepiece_trainer.cc(77) LOG(INFO) Starts training with :
trainer_spec {
input: /home/jknafou/Corpus_translation/translated_corpus/cat_title_abstract_10M.txt
input_format:
model_prefix: /home/jknafou/Language_model_training/French/Tokenizer/bio_french
model_type: UNIGRAM
vocab_size: 32000
self_test_sample_size: 0
character_coverage: 0.9995
input_sentence_size: 0
shuffle_input_sentence: 1
seed_sentencepiece_size: 1000000
shrinking_factor: 0.75
max_sentence_length: 4192
num_threads: 16
num_sub_iterations: 2
max_sentencepiece_length: 16
split_by_unicode_script: 1
split_by_number: 1
split_by_whitespace: 1
split_digits: 0
treat_whitespace_as_suffix: 0
required_chars:
byte_fallback: 0
vocabulary_output_piece_score: 1
train_extremely_large_corpus: 1
hard_vocab_limit: 1
use_all_vocab: 0
unk_id: 0
bos_id: 1
eos_id: 2
pad_id: -1
unk_piece: <unk>
bos_piece: <s>
eos_piece: </s>
pad_piece: <pad>
unk_surface:
}
normalizer_spec {
name: nmt_nfkc
add_dummy_prefix: 1
remove_extra_whitespaces: 1
escape_whitespaces: 1
normalization_rule_tsv:
}
denormalizer_spec {}
trainer_interface.cc(319) LOG(INFO) SentenceIterator is not specified. Using MultiFileSentenceIterator.
trainer_interface.cc(174) LOG(INFO) Loading corpus: /home/jknafou/Corpus_translation/translated_corpus/cat_title_abstract_10M.txt
trainer_interface.cc(346) LOG(WARNING) Found too long line (7312 > 4192).
trainer_interface.cc(348) LOG(WARNING) Too long lines are skipped in the training.
trainer_interface.cc(349) LOG(WARNING) The maximum length can be changed with --max_sentence_length=<size> flag.
trainer_interface.cc(136) LOG(INFO) Loaded 1000000 lines
trainer_interface.cc(136) LOG(INFO) Loaded 2000000 lines
trainer_interface.cc(136) LOG(INFO) Loaded 3000000 lines
trainer_interface.cc(136) LOG(INFO) Loaded 4000000 lines
trainer_interface.cc(136) LOG(INFO) Loaded 5000000 lines
trainer_interface.cc(136) LOG(INFO) Loaded 6000000 lines
trainer_interface.cc(136) LOG(INFO) Loaded 7000000 lines
trainer_interface.cc(136) LOG(INFO) Loaded 8000000 lines
trainer_interface.cc(136) LOG(INFO) Loaded 9000000 lines
trainer_interface.cc(113) LOG(WARNING) Too many sentences are loaded! (9978301), which may slow down training.
trainer_interface.cc(115) LOG(WARNING) Consider using --input_sentence_size=<size> and --shuffle_input_sentence=true.
trainer_interface.cc(118) LOG(WARNING) They allow to randomly sample <size> sentences from the entire corpus.
trainer_interface.cc(375) LOG(INFO) Loaded all 9978301 sentences
trainer_interface.cc(381) LOG(INFO) Skipped 21699 too long sentences.
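The 21699 skipped sentences are the lines exceeding the 4192-byte `max_sentence_length` cap warned about above (e.g. the 7312-byte line). A hypothetical stdlib helper (`split_by_length` is my name, not SentencePiece's) shows the same keep/skip rule, measuring length in UTF-8 bytes as the trainer does:

```python
MAX_LEN = 4192  # SentencePiece's default --max_sentence_length (bytes)

def split_by_length(lines, max_len=MAX_LEN):
    """Partition lines the way the trainer does: keep lines whose
    UTF-8 byte length is <= max_len, skip the rest."""
    kept, skipped = [], []
    for line in lines:
        (kept if len(line.encode("utf-8")) <= max_len else skipped).append(line)
    return kept, skipped

kept, skipped = split_by_length(["a" * 100, "b" * 5000, "c" * 4192])
print(len(kept), len(skipped))  # prints "2 1"
```

Raising `--max_sentence_length` (as the warning suggests) would have kept those long lines at the cost of memory during training.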
trainer_interface.cc(390) LOG(INFO) Adding meta_piece: <unk>
trainer_interface.cc(390) LOG(INFO) Adding meta_piece: <s>
trainer_interface.cc(390) LOG(INFO) Adding meta_piece: </s>
trainer_interface.cc(395) LOG(INFO) Normalizing sentences...
trainer_interface.cc(456) LOG(INFO) all chars count=16138226709
trainer_interface.cc(467) LOG(INFO) Done: 99.9527% characters are covered.
trainer_interface.cc(477) LOG(INFO) Alphabet size=95
trainer_interface.cc(478) LOG(INFO) Final character coverage=0.999527
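The 99.9527% figure follows from `character_coverage: 0.9995`: the trainer keeps the most frequent characters until their cumulative frequency reaches the target (here 95 characters suffice), and anything rarer maps to `<unk>` since `byte_fallback` is off. A rough stdlib sketch of that selection (`required_alphabet` is a hypothetical name; the real implementation works on the normalized corpus):

```python
from collections import Counter

def required_alphabet(text, coverage=0.9995):
    """Keep the most frequent characters until their cumulative
    frequency reaches `coverage`; rarer characters are dropped
    (and would surface as <unk> with byte_fallback disabled)."""
    counts = Counter(text)
    total = sum(counts.values())
    kept, covered = [], 0
    for ch, n in counts.most_common():
        if covered / total >= coverage:
            break
        kept.append(ch)
        covered += n
    return sorted(kept), covered / total

# Ω is rare enough to fall outside the 0.9995 coverage target.
alphabet, cov = required_alphabet("protéine " * 999 + "Ω", coverage=0.9995)
print(len(alphabet), cov)
```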
trainer_interface.cc(510) LOG(INFO) Done! preprocessed 9978301 sentences.
tcmalloc: large alloc 1073741824 bytes == 0x561532c8c000 @
tcmalloc: large alloc 2147483648 bytes == 0x561572c8c000 @
tcmalloc: large alloc 4294967296 bytes == 0x5615f3508000 @
tcmalloc: large alloc 8589934592 bytes == 0x5616f3508000 @
tcmalloc: large alloc 17179869184 bytes == 0x5618f3d08000 @
tcmalloc: large alloc 34359738368 bytes == 0x561cf4d08000 @
tcmalloc: large alloc 68719476736 bytes == 0x5624f6d08000 @
tcmalloc: large alloc 129105821696 bytes == 0x5634fad08000 @
tcmalloc: large alloc 129105821696 bytes == 0x5653119f0000 @
tcmalloc: large alloc 129105821696 bytes == 0x5671286d8000 @
tcmalloc: large alloc 129105821696 bytes == 0x568f3f3c0000 @
unigram_model_trainer.cc(138) LOG(INFO) Making suffix array...
unigram_model_trainer.cc(142) LOG(INFO) Extracting frequent sub strings...
unigram_model_trainer.cc(193) LOG(INFO) Initialized 1000000 seed sentencepieces
trainer_interface.cc(516) LOG(INFO) Tokenizing input sentences with whitespace: 9978301
trainer_interface.cc(526) LOG(INFO) Done! 19224817
unigram_model_trainer.cc(488) LOG(INFO) Using 19224817 sentences for EM training
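Because `split_by_whitespace: 1` is set, the corpus is pre-tokenized on whitespace before EM and collapsed to distinct words with frequencies — which is how 9978301 input lines become 19224817 distinct "sentences" for EM training. A stdlib illustration of that collapse:

```python
from collections import Counter

lines = ["le chat dort", "le chien dort bien"]

# Whitespace pre-tokenization, collapsed to unique words with counts.
word_freq = Counter(w for line in lines for w in line.split())
print(len(word_freq), dict(word_freq))  # 5 unique words from 7 tokens
```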
unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=0 size=739481 obj=12.0696 num_tokens=74362684 num_tokens/piece=100.561
unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=1 size=599525 obj=9.09794 num_tokens=74303077 num_tokens/piece=123.937
unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=0 size=449629 obj=9.06759 num_tokens=74685579 num_tokens/piece=166.105
unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=1 size=449389 obj=9.06467 num_tokens=74741532 num_tokens/piece=166.318
unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=0 size=337040 obj=9.05777 num_tokens=75208218 num_tokens/piece=223.143
unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=1 size=337037 obj=9.06272 num_tokens=75221866 num_tokens/piece=223.186
unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=0 size=252777 obj=9.05783 num_tokens=76327358 num_tokens/piece=301.955
unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=1 size=252776 obj=9.06287 num_tokens=76319490 num_tokens/piece=301.925
unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=0 size=189582 obj=9.07482 num_tokens=78333478 num_tokens/piece=413.19
unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=1 size=189582 obj=9.0738 num_tokens=78313493 num_tokens/piece=413.085
unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=0 size=142186 obj=9.10085 num_tokens=80614240 num_tokens/piece=566.963
unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=1 size=142186 obj=9.09887 num_tokens=80596413 num_tokens/piece=566.838
unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=0 size=106639 obj=9.13963 num_tokens=83102337 num_tokens/piece=779.287
unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=1 size=106639 obj=9.12951 num_tokens=83086218 num_tokens/piece=779.135
unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=0 size=79979 obj=9.19014 num_tokens=85866144 num_tokens/piece=1073.61
unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=1 size=79979 obj=9.18657 num_tokens=85860214 num_tokens/piece=1073.53
unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=0 size=59984 obj=9.25486 num_tokens=88825440 num_tokens/piece=1480.82
unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=1 size=59984 obj=9.23964 num_tokens=88813706 num_tokens/piece=1480.62
unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=0 size=44988 obj=9.33847 num_tokens=92202568 num_tokens/piece=2049.49
unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=1 size=44988 obj=9.32441 num_tokens=92203259 num_tokens/piece=2049.51
unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=0 size=35200 obj=9.4222 num_tokens=95280571 num_tokens/piece=2706.83
unigram_model_trainer.cc(504) LOG(INFO) EM sub_iter=1 size=35200 obj=9.40217 num_tokens=95275736 num_tokens/piece=2706.7
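The shrinking sizes above (599525 → 449629 → … → 35200) follow the `shrinking_factor: 0.75` schedule: each outer EM round prunes the piece set by about 25% until it reaches roughly 1.1 × `vocab_size` (35200 for the requested 32000; the extra 10% is trimmed when the final model is saved). A small sketch of that schedule — the exact logged sizes differ by a few dozen pieces because real pruning also accounts for piece scores:

```python
def pruning_schedule(seed_size, vocab_size, shrinking_factor=0.75):
    """Approximate piece-set sizes after each outer EM round:
    shrink by `shrinking_factor` until ~1.1 * vocab_size is reached
    (35200 for vocab_size=32000, as in the log)."""
    target = int(vocab_size * 1.1)
    sizes, size = [], seed_size
    while size > target:
        size = max(target, int(size * shrinking_factor))
        sizes.append(size)
    return sizes

print(pruning_schedule(599525, 32000))  # 10 rounds, ending at 35200
```

The two `sub_iter` lines per size are the `num_sub_iterations: 2` inner EM passes run at each pruning level.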
trainer_interface.cc(604) LOG(INFO) Saving model: /home/jknafou/Language_model_training/French/Tokenizer/bio_french.model
trainer_interface.cc(615) LOG(INFO) Saving vocabs: /home/jknafou/Language_model_training/French/Tokenizer/bio_french.vocab