Update README.md
README.md CHANGED
@@ -31,3 +31,56 @@ Please cite the original paper if you decide to use the model:
`awesome-align` is a tool that can extract word alignments from multilingual BERT (mBERT) and lets you fine-tune mBERT on parallel corpora for better alignment quality (see our paper for more details). A [demo notebook](https://colab.research.google.com/drive/1205ubqebM0OsZa1nRgbGJBtitgHqIVv6?usp=sharing) is available.

## Usage

```python
from transformers import AutoModel, AutoTokenizer
import itertools
import torch

# load the fine-tuned alignment model and its tokenizer
model = AutoModel.from_pretrained("aneuraz/awesome-align-with-co")
tokenizer = AutoTokenizer.from_pretrained("aneuraz/awesome-align-with-co")

# model parameters
align_layer = 8
threshold = 1e-3

# define inputs
src = 'awesome-align is awesome !'
tgt = '牛对齐 是 牛 !'

# pre-processing: tokenize each word into subwords and keep a subword-to-word index map
sent_src, sent_tgt = src.strip().split(), tgt.strip().split()
token_src, token_tgt = [tokenizer.tokenize(word) for word in sent_src], [tokenizer.tokenize(word) for word in sent_tgt]
wid_src, wid_tgt = [tokenizer.convert_tokens_to_ids(x) for x in token_src], [tokenizer.convert_tokens_to_ids(x) for x in token_tgt]
ids_src = tokenizer.prepare_for_model(list(itertools.chain(*wid_src)), return_tensors='pt', model_max_length=tokenizer.model_max_length, truncation=True)['input_ids']
ids_tgt = tokenizer.prepare_for_model(list(itertools.chain(*wid_tgt)), return_tensors='pt', model_max_length=tokenizer.model_max_length, truncation=True)['input_ids']

sub2word_map_src = []
for i, word_list in enumerate(token_src):
  sub2word_map_src += [i for x in word_list]
sub2word_map_tgt = []
for i, word_list in enumerate(token_tgt):
  sub2word_map_tgt += [i for x in word_list]

# alignment: take hidden states from `align_layer`, compute subword similarities,
# and keep the pairs whose softmax probability exceeds `threshold` in both directions
model.eval()
with torch.no_grad():
  out_src = model(ids_src.unsqueeze(0), output_hidden_states=True)[2][align_layer][0, 1:-1]
  out_tgt = model(ids_tgt.unsqueeze(0), output_hidden_states=True)[2][align_layer][0, 1:-1]

  dot_prod = torch.matmul(out_src, out_tgt.transpose(-1, -2))

  softmax_srctgt = torch.nn.Softmax(dim=-1)(dot_prod)
  softmax_tgtsrc = torch.nn.Softmax(dim=-2)(dot_prod)

  softmax_inter = (softmax_srctgt > threshold) * (softmax_tgtsrc > threshold)

# map aligned subword pairs back to word-level index pairs
align_subwords = torch.nonzero(softmax_inter, as_tuple=False)
align_words = set()
for i, j in align_subwords:
  align_words.add( (sub2word_map_src[i], sub2word_map_tgt[j]) )

print(align_words)
```
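
`align_words` holds (source word index, target word index) pairs. As a quick, illustrative check (not part of the original snippet), you can map those indices back to the words themselves:

```python
# illustrative only: print aligned word pairs instead of raw index pairs
for i, j in sorted(align_words):
  print(f'{sent_src[i]} <-> {sent_tgt[j]}')
```

For the example above, each English word should pair with its Chinese counterpart, e.g. `is <-> 是`.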