Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks
Paper
•
1908.10084
•
Published
•
9
This is a sentence-transformers model finetuned from microsoft/unixcoder-base-unimodal on the soco_train_java dataset. It maps sentences & paragraphs to a 768-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.
SentenceTransformer(
(0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: RobertaModel
(1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)
First install the Sentence Transformers library:
pip install -U sentence-transformers
Then you can load this model and run inference.
from sentence_transformers import SentenceTransformer
# Download from the 🤗 Hub
model = SentenceTransformer("buelfhood/SOCO-Java-UnixCoder-Softmax-PairClass-VAST-e50")
# Run inference
sentences = [
'\n\nimport java.awt.*;\nimport java.awt.event.*;\nimport java.io.*;\nimport java.net.*;\n\npublic class BruteForce extends Frame implements ActionListener {\n\tprivate TextField tf = new TextField();\n private TextArea ta = new TextArea();\n\n \tpublic void actionPerformed (ActionEvent e) {\n\t\tString s = tf.getText();\n\t\tString login="";\n\n\t\tif (s.length() != 0)\n\t\t{\n\t\t\tchar symbol = \'A\';\n\n\t\t\tlogin=":";\n\t\t\t\n\t\t\tfor(int i = 0; i < 3; i++)\n\t\t\t{\n\t\t\t\tsymbol = (char)(57.0 * Math.random() + 65);\n\n\t\t\t\tif(symbol>90 && symbol<97){\n\t\t\t\t\ti--;\n\t\t\t\t\tcontinue;\n\t\t\t\t}\n\t\t\t\tlogin=login+symbol;\n\n\t\t\t}\n\n\t\t ta.setText (fetchURL (s,login));\n\t\t System.out.println("runing"+login);\n\t\t}while(ta.getText().compareTo("Invalid URL")!=0 || ta.getText().compareTo("Error URL")!=0);\n\n\t\tSystem.out.println("The password is: "+login);\n\t}\n\n\tpublic BruteForce() {\n\t\tsuper ("SEC-CRACK");\n\n\t \n\t add (tf, BorderLayout.LEFT);\n\t ta.setEditable(false);\n\t add (ta, BorderLayout.CENTER);\n\t tf.addActionListener (this);\n\n\t addWindowListener (new WindowAdapter() {\n\t public void windowClosing (WindowEvent e) {\n\t dispose();\n\t System.exit(0);\n\t }\n\t });\n\t}\n\n\tprivate String fetchURL (String urlString,String login) {\n\t\tStringWriter sw = new StringWriter();\n\t PrintWriter pw = new PrintWriter();\n\n\t try {\n\t URL url = new URL (urlString);\n\n\t \n\t\n\t \n\n\t \n\t String encoding = new url.misc.BASE64Encoder().encode (login.getBytes());\n\n\t \n\t URLConnection uc = url.openConnection();\n\t uc.setRequestProperty ("Authorization", " " + encoding);\n\t InputStream content = (InputStream)uc.getInputStream();\n\t BufferedReader in =\n\t new BufferedReader (new InputStreamReader (content));\n\t String line;\n\t while ((line = in.readLine()) != null) {\n\t pw.println (line);\n\t }\n\t } catch (MalformedURLException e) {\n\t pw.println ("Invalid URL");\n\t } catch (IOException e) {\n\t pw.println ("Error 
URL");\n\t }\n\t return sw.toString();\n\t}\n\n\n\tpublic static void main(String args[]) {\n\t\tFrame f = new BruteForce();\n\t f.setSize(300, 300);\n\t f.setVisible (true);\n\t}\n}\n\n\n\n\nclass Base64Converter\n{\n\tpublic static final char [ ] alphabet = {\n \'A\', \'B\', \'C\', \'D\', \'E\', \'F\', \'G\', \'H\', \n \'I\', \'J\', \'K\', \'L\', \'M\', \'N\', \'O\', \'P\', \n \'Q\', \'R\', \'S\', \'T\', \'U\', \'V\', \'W\', \'X\', \n \'Y\', \'Z\', \'a\', \'b\', \'c\', \'d\', \'e\', \'f\', \n \'g\', \'h\', \'i\', \'j\', \'k\', \'l\', \'m\', \'n\', \n \'o\', \'p\', \'q\', \'r\', \'s\', \'t\', \'u\', \'v\', \n \'w\', \'x\', \'y\', \'z\', \'0\', \'1\', \'2\', \'3\', \n \'4\', \'5\', \'6\', \'7\', \'8\', \'9\', \'+\', \'/\' }; \n\n\n public static String encode ( String s )\n {\n return encode ( s.getBytes ( ) );\n }\n\n public static String encode ( byte [ ] octetString )\n {\n int bits24;\n int bits6;\n\n char [ ] out\n = new char [ ( ( octetString.length - 1 ) / 3 + 1 ) * 4 ];\n\n int outIndex = 0;\n int i = 0;\n\n while ( ( i + 3 ) <= octetString.length )\n {\n \n bits24 = ( octetString [ i++ ] & 0xFF ) << 16;\n bits24 |= ( octetString [ i++ ] & 0xFF ) << 8;\n bits24 |= ( octetString [ i++ ] & 0xFF ) << 0;\n\n bits6 = ( bits24 & 0x00FC0000 ) >> 18;\n out [ outIndex++ ] = alphabet [ bits6 ];\n bits6 = ( bits24 & 0x0003F000 ) >> 12;\n out [ outIndex++ ] = alphabet [ bits6 ];\n bits6 = ( bits24 & 0x00000FC0 ) >> 6;\n out [ outIndex++ ] = alphabet [ bits6 ];\n bits6 = ( bits24 & 0x0000003F );\n out [ outIndex++ ] = alphabet [ bits6 ];\n }\n\n if ( octetString.length - i == 2 )\n {\n \n bits24 = ( octetString [ i ] & 0xFF ) << 16;\n bits24 |= ( octetString [ i + 1 ] & 0xFF ) << 8;\n\n bits6 = ( bits24 & 0x00FC0000 ) >> 18;\n out [ outIndex++ ] = alphabet [ bits6 ];\n bits6 = ( bits24 & 0x0003F000 ) >> 12;\n out [ outIndex++ ] = alphabet [ bits6 ];\n bits6 = ( bits24 & 0x00000FC0 ) >> 6;\n out [ outIndex++ ] = alphabet [ bits6 ];\n\n \n out [ outIndex++ ] = \'=\';\n 
}\n else if ( octetString.length - i == 1 )\n {\n \n bits24 = ( octetString [ i ] & 0xFF ) << 16;\n\n bits6 = ( bits24 & 0x00FC0000 ) >> 18;\n out [ outIndex++ ] = alphabet [ bits6 ];\n bits6 = ( bits24 & 0x0003F000 ) >> 12;\n out [ outIndex++ ] = alphabet [ bits6 ];\n\n \n out [ outIndex++ ] = \'=\';\n out [ outIndex++ ] = \'=\';\n }\n\n return new String ( out );\n }\n}\n\n',
'\n\nimport java.io.*;\nimport java.text.*;\nimport java.util.*;\nimport java.net.*;\n\npublic class BruteForce extends Thread\n{\n private static final String USERNAME = "";\n private static final char [] POSSIBLE_CHAR =\n {\'a\', \'b\', \'c\', \'d\', \'e\', \'f\', \'g\', \'h\', \'i\', \'j\', \'k\', \'l\', \'m\',\n \'n\', \'o\', \'p\', \'q\', \'r\', \'s\', \'t\', \'u\', \'v\', \'w\', \'x\', \'y\', \'z\',\n \'A\', \'B\', \'C\', \'D\', \'E\', \'F\', \'G\', \'H\', \'I\', \'J\', \'K\', \'L\', \'M\',\n \'N\', \'O\', \'P\', \'Q\', \'R\', \'S\', \'T\', \'U\', \'V\', \'W\', \'X\', \'Y\', \'Z\'};\n private static int NUMBER_OF_THREAD = 500;\n\n private static Date startDate = null;\n private static Date endDate = null;\n\n private String address;\n private String password;\n\n public BruteForce(String address, String password)\n {\n this.address = address;\n this.password = password;\n }\n\n public static void main(String[] args) throws IOException\n {\n if (args.length < 1)\n {\n System.err.println("Invalid usage!");\n System.err.println("Usage: java BruteForce <url>");\n System.exit(1);\n }\n\n try\n {\n brute(args[0], USERNAME);\n }\n catch(Exception e)\n {\n e.printStackTrace();\n System.exit(1);\n }\n }\n\n public static void brute(String address, String user)\n {\n BruteForce [] threads = new BruteForce[NUMBER_OF_THREAD];\n int index = 0;\n\n startDate = new Date();\n for(int i = 0; i < POSSIBLE_CHAR.length; i++)\n {\n for(int j = 0; j < POSSIBLE_CHAR.length; j++)\n {\n for(int k = 0; k < POSSIBLE_CHAR.length; k++)\n {\n String password = ""+POSSIBLE_CHAR[i]+POSSIBLE_CHAR[j]+\n POSSIBLE_CHAR[k];\n\n if (threads[index] != null && threads[index].isAlive())\n {\n try\n {\n threads[index].join();\n }\n catch(InterruptedException e ) {}\n }\n threads[index] = new BruteForce(address, password);\n threads[index].get();\n\n index = (index++) % threads.length;\n }\n }\n }\n }\n\n public void run()\n {\n if (endDate != null)\n return;\n\n try\n {\n\n URLConnection conn = (new 
URL(address)).openConnection();\n conn.setDoInput(true);\n\n if (login(conn, USERNAME, password))\n {\n endDate = new Date();\n System.out.println("Found the password: \\""+password+"\\"!");\n SimpleDateFormat format = new SimpleDateFormat("dd/MM/yyyy HH:mm:");\n System.out.println("Process started at: "+format.format(startDate));\n System.out.println("Process started at: "+format.format(endDate));\n double timeTaken = (double)(endDate.getTime()-startDate.getTime())/60000;\n System.out.println("Time taken: "+timeTaken+" minutes");\n System.exit(0);\n }\n else\n {\n System.out.println("Password: \\""+password+"\\" Failed!");\n return;\n }\n }\n catch(Exception e)\n {\n e.printStackTrace();\n }\n\n }\n\n public static boolean login(URLConnection conn, String user, String pass)\n {\n try\n {\n String encodeAuth = " "+Base64Encoder.encode(user+":"+pass);\n conn.setRequestProperty ("Authorization", encodeAuth);\n conn.connect();\n conn.getInputStream();\n }\n catch(Exception e)\n {\n return false;\n }\n return true;\n }\n}\n\n\n',
'\n\nimport java.net.*;\nimport java.io.*;\n\npublic class Base64Encoder\n{\n private final static char base64Array [] = {\n \'A\', \'B\', \'C\', \'D\', \'E\', \'F\', \'G\', \'H\',\n \'I\', \'J\', \'K\', \'L\', \'M\', \'N\', \'O\', \'P\',\n \'Q\', \'R\', \'S\', \'T\', \'U\', \'V\', \'W\', \'X\',\n \'Y\', \'Z\', \'a\', \'b\', \'c\', \'d\', \'e\', \'f\',\n \'g\', \'h\', \'i\', \'j\', \'k\', \'l\', \'m\', \'n\',\n \'o\', \'p\', \'q\', \'r\', \'s\', \'t\', \'u\', \'v\',\n \'w\', \'x\', \'y\', \'z\', \'0\', \'1\', \'2\', \'3\',\n \'4\', \'5\', \'6\', \'7\', \'8\', \'9\', \'+\', \'/\'\n };\n\n public static String encode (String string)\n {\n String encodedString = "";\n byte bytes [] = string.getBytes ();\n int i = 0;\n int pad = 0;\n while (i < bytes.length)\n {\n byte b1 = bytes [i++];\n byte b2;\n byte b3;\n if (i >= bytes.length)\n {\n b2 = 0;\n b3 = 0;\n pad = 2;\n }\n else\n {\n b2 = bytes [i++];\n if (i >= bytes.length)\n {\n b3 = 0;\n pad = 1;\n }\n else\n b3 = bytes [i++];\n }\n\n byte c1 = (byte)(b1 >> 2);\n byte c2 = (byte)(((b1 & 0x3) << 4) | (b2 >> 4));\n byte c3 = (byte)(((b2 & 0xf) << 2) | (b3 >> 6));\n byte c4 = (byte)(b3 & 0x3f);\n encodedString += base64Array [c1];\n encodedString += base64Array [c2];\n switch (pad)\n {\n case 0:\n encodedString += base64Array [c3];\n encodedString += base64Array [c4];\n break;\n case 1:\n encodedString += base64Array [c3];\n encodedString += "=";\n break;\n case 2:\n encodedString += "==";\n break;\n }\n }\n return encodedString;\n }\n}\n',
]
embeddings = model.encode(sentences)
print(embeddings.shape)
# [3, 768]
# Get the similarity scores for the embeddings
similarities = model.similarity(embeddings, embeddings)
print(similarities.shape)
# [3, 3]
Columns: label, text_1, and text_2

| | label | text_1 | text_2 |
|---|---|---|---|
| type | int | string | string |
| details |
|
|
|
| label | text_1 | text_2 |
|---|---|---|
0 |
|
import java.io.*; |
0 |
|
|
0 |
|
import java.io.*; |
SoftmaxLoss

Columns: label, text_1, and text_2

| | label | text_1 | text_2 |
|---|---|---|---|
| type | int | string | string |
| details |
|
|
|
| label | text_1 | text_2 |
|---|---|---|
0 |
|
|
0 |
import java.io.*; |
|
0 |
|
|
SoftmaxLoss

- eval_strategy: steps
- per_device_train_batch_size: 16
- per_device_eval_batch_size: 16
- num_train_epochs: 1
- fp16: True

- overwrite_output_dir: False
- do_predict: False
- eval_strategy: steps
- prediction_loss_only: True
- per_device_train_batch_size: 16
- per_device_eval_batch_size: 16
- per_gpu_train_batch_size: None
- per_gpu_eval_batch_size: None
- gradient_accumulation_steps: 1
- eval_accumulation_steps: None
- torch_empty_cache_steps: None
- learning_rate: 5e-05
- weight_decay: 0.0
- adam_beta1: 0.9
- adam_beta2: 0.999
- adam_epsilon: 1e-08
- max_grad_norm: 1.0
- num_train_epochs: 1
- max_steps: -1
- lr_scheduler_type: linear
- lr_scheduler_kwargs: {}
- warmup_ratio: 0.0
- warmup_steps: 0
- log_level: passive
- log_level_replica: warning
- log_on_each_node: True
- logging_nan_inf_filter: True
- save_safetensors: True
- save_on_each_node: False
- save_only_model: False
- restore_callback_states_from_checkpoint: False
- no_cuda: False
- use_cpu: False
- use_mps_device: False
- seed: 42
- data_seed: None
- jit_mode_eval: False
- use_ipex: False
- bf16: False
- fp16: True
- fp16_opt_level: O1
- half_precision_backend: auto
- bf16_full_eval: False
- fp16_full_eval: False
- tf32: None
- local_rank: 0
- ddp_backend: None
- tpu_num_cores: None
- tpu_metrics_debug: False
- debug: []
- dataloader_drop_last: False
- dataloader_num_workers: 0
- dataloader_prefetch_factor: None
- past_index: -1
- disable_tqdm: False
- remove_unused_columns: True
- label_names: None
- load_best_model_at_end: False
- ignore_data_skip: False
- fsdp: []
- fsdp_min_num_params: 0
- fsdp_config: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
- fsdp_transformer_layer_cls_to_wrap: None
- accelerator_config: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
- deepspeed: None
- label_smoothing_factor: 0.0
- optim: adamw_torch
- optim_args: None
- adafactor: False
- group_by_length: False
- length_column_name: length
- ddp_find_unused_parameters: None
- ddp_bucket_cap_mb: None
- ddp_broadcast_buffers: False
- dataloader_pin_memory: True
- dataloader_persistent_workers: False
- skip_memory_metrics: True
- use_legacy_prediction_loop: False
- push_to_hub: False
- resume_from_checkpoint: None
- hub_model_id: None
- hub_strategy: every_save
- hub_private_repo: None
- hub_always_push: False
- gradient_checkpointing: False
- gradient_checkpointing_kwargs: None
- include_inputs_for_metrics: False
- include_for_metrics: []
- eval_do_concat_batches: True
- fp16_backend: auto
- push_to_hub_model_id: None
- push_to_hub_organization: None
- mp_parameters:
- auto_find_batch_size: False
- full_determinism: False
- torchdynamo: None
- ray_scope: last
- ddp_timeout: 1800
- torch_compile: False
- torch_compile_backend: None
- torch_compile_mode: None
- include_tokens_per_second: False
- include_num_input_tokens_seen: False
- neftune_noise_alpha: None
- optim_target_modules: None
- batch_eval_metrics: False
- eval_on_start: False
- use_liger_kernel: False
- eval_use_gather_object: False
- average_tokens_across_devices: False
- prompts: None
- batch_sampler: batch_sampler
- multi_dataset_batch_sampler: proportional

| Epoch | Step | Training Loss | Validation Loss |
|---|---|---|---|
| 0.0957 | 100 | 0.0506 | 0.0198 |
| 0.1914 | 200 | 0.0159 | 0.0178 |
| 0.2871 | 300 | 0.0267 | 0.0195 |
| 0.3828 | 400 | 0.0341 | 0.0202 |
| 0.4785 | 500 | 0.0139 | 0.0161 |
| 0.5742 | 600 | 0.0142 | 0.0157 |
| 0.6699 | 700 | 0.0244 | 0.0154 |
| 0.7656 | 800 | 0.018 | 0.0152 |
| 0.8612 | 900 | 0.0088 | 0.0153 |
| 0.9569 | 1000 | 0.0205 | 0.0146 |
@inproceedings{reimers-2019-sentence-bert,
title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
author = "Reimers, Nils and Gurevych, Iryna",
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
month = "11",
year = "2019",
publisher = "Association for Computational Linguistics",
url = "https://arxiv.org/abs/1908.10084",
}
Base model
microsoft/unixcoder-base-unimodal