Denis202 committed on
Commit
595bbee
·
1 Parent(s): db33a58

Added Kiswahili training script and training data

Browse files
.gitattributes CHANGED
@@ -1,35 +1,5 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ # Git attributes
2
+ *.py text
3
+ *.txt text
4
+ *.md text
5
+ *.json text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
train_model.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import (
3
+ AutoModelForCausalLM,
4
+ AutoTokenizer,
5
+ TrainingArguments,
6
+ Trainer,
7
+ DataCollatorForLanguageModeling
8
+ )
9
+ from datasets import Dataset
10
+ import os
11
+ import glob
12
+
13
class KiswahiliTrainer:
    """Fine-tune a causal language model on Kiswahili "User:/Bot:" dialogues.

    The constructor loads the tokenizer and model named by ``model_name``.
    ``train`` reads the dialogue text files, tokenizes them and runs a
    Hugging Face ``Trainer`` over the result.
    """

    def __init__(self, model_name="distilgpt2"):
        """Load the tokenizer and the causal LM for ``model_name``.

        Args:
            model_name: Identifier passed to ``from_pretrained`` for both the
                tokenizer and the model.
        """
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Reuse the end-of-sequence token as the padding token so that
        # batches can be padded.
        self.tokenizer.pad_token = self.tokenizer.eos_token

        self.model = AutoModelForCausalLM.from_pretrained(model_name)
        print("✅ Model and tokenizer loaded successfully!")

    @staticmethod
    def _split_paragraphs(text):
        """Split ``text`` into blocks separated by one or more blank lines.

        A line that contains only whitespace counts as blank, so a stray
        space or tab on a separator line does not glue two blocks together.

        Args:
            text: Raw file content.

        Returns:
            A list of non-empty blocks, each stripped of surrounding
            whitespace, in their original order.
        """
        blocks = []
        current = []
        for line in text.split("\n"):
            if line.strip():
                current.append(line)
            elif current:
                blocks.append("\n".join(current).strip())
                current = []
        if current:
            blocks.append("\n".join(current).strip())
        return blocks

    def load_training_data(self, data_dir="./training_data"):
        """Collect Q&A examples from every ``*.txt`` file in ``data_dir``.

        Each file is split into blank-line-separated blocks; a block is kept
        only if it contains both a ``User:`` and a ``Bot:`` marker. Files that
        cannot be read or decoded as UTF-8 are reported and skipped.

        Args:
            data_dir: Directory that holds the training text files.

        Returns:
            A list of example strings (empty if nothing was found).
        """
        conversations = []

        # Sorted so that the example order does not depend on the order in
        # which the filesystem happens to list the files.
        text_files = sorted(glob.glob(os.path.join(data_dir, "*.txt")))

        for file_path in text_files:
            print(f"📖 Loading data from: {file_path}")
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
            except (OSError, UnicodeDecodeError) as e:
                # Best effort: one unreadable file must not abort the load.
                print(f"❌ Error reading {file_path}: {e}")
                continue

            conversations.extend(
                pair
                for pair in self._split_paragraphs(content)
                if 'User:' in pair and 'Bot:' in pair
            )

        print(f"📊 Loaded {len(conversations)} training examples")
        return conversations

    def prepare_dataset(self, conversations, max_length=512, padding="max_length"):
        """Turn raw examples into a tokenized ``Dataset``.

        Every ``Bot:`` marker is rewritten to ``Assistant:`` before
        tokenization.

        Args:
            conversations: Example strings as returned by
                ``load_training_data``.
            max_length: Truncation length (and padding length when
                ``padding="max_length"``) passed to the tokenizer.
            padding: Padding strategy passed to the tokenizer. The default
                pads every example to ``max_length``; pass ``False`` to leave
                padding to a data collator.

        Returns:
            The dataset produced by ``Dataset.map`` with the tokenizer output
            added to the original ``text`` column.
        """
        records = [
            {"text": conv.replace('Bot:', 'Assistant:')}
            for conv in conversations
        ]
        dataset = Dataset.from_list(records)

        def tokenize_function(examples):
            return self.tokenizer(
                examples["text"],
                truncation=True,
                max_length=max_length,
                padding=padding,
            )

        return dataset.map(tokenize_function, batched=True)

    def train(self, output_dir="./trained_model", data_dir="./training_data"):
        """Run the full pipeline: load data, tokenize, train and save.

        Args:
            output_dir: Directory that receives checkpoints, the final model
                and the tokenizer files.
            data_dir: Directory that holds the training text files.

        Returns:
            None. If no training examples are found a message is printed and
            the method returns without training.
        """
        conversations = self.load_training_data(data_dir)

        if not conversations:
            print("❌ No training data found!")
            return

        # Leave padding to the data collator, which pads each batch only to
        # its longest example instead of padding every example to max_length.
        dataset = self.prepare_dataset(conversations, padding=False)

        # NOTE(review): warmup_steps, logging_steps and save_steps are fixed
        # step counts; with a small corpus the run may finish before reaching
        # them — confirm they suit the dataset size.
        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=3,
            per_device_train_batch_size=4,
            save_steps=500,
            save_total_limit=2,
            prediction_loss_only=True,
            logging_dir='./logs',
            logging_steps=100,
            learning_rate=5e-5,
            warmup_steps=100,
        )

        data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer,
            mlm=False  # We're doing causal LM, not masked LM
        )

        # NOTE(review): newer transformers releases appear to rename the
        # `tokenizer` argument to `processing_class` — confirm against the
        # installed version.
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=dataset,
            data_collator=data_collator,
            tokenizer=self.tokenizer,
        )

        print("🚀 Starting training...")
        trainer.train()

        # save_model() is called without a path; the tokenizer is saved
        # explicitly to output_dir.
        print("💾 Saving model...")
        trainer.save_model()
        self.tokenizer.save_pretrained(output_dir)
        print(f"✅ Training complete! Model saved to {output_dir}")
122
+
123
# Script entry point: build a trainer with its default settings and run one
# complete training pass.
if __name__ == "__main__":
    KiswahiliTrainer().train()
training_data/grammar.txt ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ User: Unatumia "ni" na "si" vipi?
2
+ Bot: "Ni" hutumika kuthibitisha jambo (kauli yakinifu). Mfano: "Mimi ni mwalimu". "Si" hutumika kukanusha. Mfano: "Mimi si mwalimu".
3
+
4
+ User: Tofautisha kati ya "yeye" na "wewe"
5
+ Bot: "Wewe" ni kiwakilishi cha mtu wa pili (you). "Yeye" ni kiwakilishi cha mtu wa tatu (he/she).
6
+
7
+ User: Nini maana ya viwakilishi nafsi?
8
+ Bot: Viwakilishi nafsi ni maneno yanayowakilisha watu kwenye sentensi. Kuna: mimi, wewe, yeye, sisi, nyinyi, wao.
9
+
10
+ User: Je, "mtoto" ni ngeli gani?
11
+ Bot: "Mtoto" ni nomino ya ngeli ya A-WA: mtoto (umoja), watoto (wingi).
12
+
13
+ User: Toa mifano ya vitenzi vya kutendwa na kutendesha
14
+ Bot: Kutendwa: "Kitabu kimesomwa na Juma". Kutendesha: "Mwalimu anamsomesha Juma".
15
+
16
+ User: Nini tofauti kati ya "liko" na "iko"?
17
+ Bot: "Liko" hutumika kwa vitu vya ngeli ya LI-YA. "Iko" hutumika kwa vitu vya ngeli ya I-ZI na N-ZI.
18
+
19
+ User: Vitenzi vya kupishana hutumikaje?
20
+ Bot: Vitenzi vya kupishana huonyesha vitendo vinavyofanyika kwa pande zote mbili. Mfano: "Juma na Asha wanapendana".
21
+
22
+ User: Nini maana ya kauli ya kutendwa?
23
+ Bot: Kauli ya kutendwa huonyesha kitendo kinachotendwa kwa mtu au kitu. Mfano: "Chakula kimepakuliwa na mama".
24
+
25
+ User: Taja vitenzi vya nyakati tofauti
26
+ Bot: 1. Wakati uliopita: "nilisoma"
27
+ 2. Wakati ujao: "nitasoma"
28
+ 3. Wakati uliopo: "ninasoma"
29
+
30
+ User: Nini tofauti kati ya "huwa" na "awa"?
31
+ Bot: "Huwa" ni kitenzi "kuwa" katika hali ya mazoea (usually is). Mfano: "Yeye huwa nyumbani jioni". "Awa" si neno sahihi la Kiswahili; huenda unakusudia "hawa" (these).
32
+
33
+ User: Je, "kula" na "mla" ni vitenzi sawa?
34
+ Bot: Hapana, "kula" ni kitenzi cha msingi (to eat). "Mla" ni neno linalomaanisha "mlaji" (eater) au hutumika kwa ngeli ya M-WA.
35
+
36
+ User: Nini maana ya "na" katika sentensi?
37
+ Bot: "Na" inaweza kuwa kiunganishi (and) au nyakati ya kisasa (present continuous). Mfano: "Juma na Asha" (and) au "Anasoma" (is reading).
38
+
39
+ User: Vitenzi vya kutenda na kutendwa
40
+ Bot: Kutenda: "Mwalimu anafundisha wanafunzi". Kutendwa: "Wanafunzi wanafundishwa na mwalimu".
41
+
42
+ User: Nini maana ya kiulizi?
43
+ Bot: Kiulizi ni neno linalotumika kuuliza swali. Kama: nani, nini, lini, wapi, vipi, kwa nini, gani.
44
+
45
+ User: Tofautisha kati ya "hii" na "hili"
46
+ Bot: "Hii" hutumika kwa vitu vya ngeli ya I-ZI. "Hili" hutumika kwa vitu vya ngeli ya LI-YA.
47
+
48
+ User: Mfano wa sentensi yenye kihusishi
49
+ Bot: Kihusishi ni neno kama "ya", "za", "la". Mfano: "Kitabu cha Juma", "Nyumba za watu".
50
+
51
+ User: Nini maana ya kiwakilishi cha kutaja tena?
52
+ Bot: Ni kiwakilishi kinachotumika badala ya jina lililotajwa tayari. Mfano: "Juma amekuja. Yeye ni mwalimu".
53
+
54
+ User: Taja aina za maneno
55
+ Bot: Kuna: nomino (nouns), vitenzi (verbs), vivumishi (adjectives), viwakilishi (pronouns), vielezi (adverbs), viunganishi (conjunctions), vihusishi (prepositions), vihisishi (interjections).
56
+
57
+ User: Nini tofauti kati ya "kwanza" na "kuanza"?
58
+ Bot: "Kwanza" inamaanisha "first". "Kuanza" ni kitenzi kinachomaanisha "to begin".
59
+
60
+ User: Mfano wa sentensi yenye kirejeshi
61
+ Bot: Kirejeshi ni neno kama "ambaye", "ambacho", "ambao". Mfano: "Mtoto ambaye anasoma", "Kitabu ambacho nasoma".
training_data/greetings.txt ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ User: Habari yako?
2
+ Bot: Nzuri sana, na wewe je?
3
+
4
+ User: Hujambo?
5
+ Bot: Sijambo!
6
+
7
+ User: U hali gani?
8
+ Bot: Niko salama, na wewe je?
9
+
10
+ User: Mambo?
11
+ Bot: Poa!
12
+
13
+ User: Vipi?
14
+ Bot: Safi!
15
+
16
+ User: Shikamoo?
17
+ Bot: Marahaba!
18
+
19
+ User: Hamjambo?
20
+ Bot: Hatujambo!
21
+
22
+ User: Umeamkaje?
23
+ Bot: Nimeamka salama, na wewe umeamkaje?
24
+
25
+ User: Umeshindaje?
26
+ Bot: Nimeshinda vizuri, na wewe umeshindaje?
27
+
28
+ User: Hali za leo?
29
+ Bot: Njema sana, na wewe hali zako?
30
+
31
+ User: Habari za asubuhi?
32
+ Bot: Nzuri sana, habari za asubuhi!
33
+
34
+ User: Habari za mchana?
35
+ Bot: Njema, habari za mchana!
36
+
37
+ User: Habari za jioni?
38
+ Bot: Zipo vizuri, habari za jioni!
39
+
40
+ User: Habari za kazi?
41
+ Bot: Kazi iendelee, Mungu akipenda!
42
+
43
+ User: Habari za nyumbani?
44
+ Bot: Zote vizuri, Mungu akipenda.
45
+
46
+ User: Habari za familia?
47
+ Bot: Wote wazima, na wa kwenu?
48
+
49
+ User: Habari za safari?
50
+ Bot: Safari salama, Mungu akipenda.
51
+
52
+ User: Umetoka wapi?
53
+ Bot: Nimetoka nyumbani/kazini/sokoni.
54
+
55
+ User: Unaenda wapi?
56
+ Bot: Naenda nyumbani/kazini/sokoni.
57
+
58
+ User: Uko poa?
59
+ Bot: Poa kama ndizi!
60
+
61
+ User: Mzima?
62
+ Bot: Mzima wa afya!
63
+
64
+ User: Maisha yanaendelea vipi?
65
+ Bot: Yanadunda, Mungu akipenda.
66
+
67
+ User: Mungu awabariki
68
+ Bot: Amen, na akubariki wewe pia.
69
+
70
+ User: Umepotea wapi?
71
+ Bot: Samahani, nilikuwa na shughuli nyingi.
72
+
73
+ User: Karibu
74
+ Bot: Asante sana kwa kunikaribisha.
75
+
76
+ User: Asante
77
+ Bot: Karibu sana.
78
+
79
+ User: Asante sana
80
+ Bot: Karibu tena.
81
+
82
+ User: Pole na masaa
83
+ Bot: Hamna shida, hakuna haraka.
84
+
85
+ User: Pole na kazi
86
+ Bot: Asante, kazi ni maisha.
87
+
88
+ User: Hongera
89
+ Bot: Asante sana, Mungu akipendelea.
90
+
91
+ User: Usiku mwema
92
+ Bot: Usiku mwema, lala salama.
93
+
94
+ User: Kwaheri
95
+ Bot: Kwaheri, tutaonana tena.
96
+
97
+ User: Tutaonana
98
+ Bot: Ndio, tutaonana baadaye.
99
+
100
+ User: Baadaye
101
+ Bot: Sawa, baadaye.
102
+
103
+ User: Kesho
104
+ Bot: Sawa, kesho.
105
+
106
+ User: Haya
107
+ Bot: Sawa, tuendelee.
108
+
109
+ User: Sawa
110
+ Bot: Sawa kabisa.
111
+
112
+ User: Ahsante kwa msaada
113
+ Bot: Karibu sana, ni furaha yangu.
114
+
115
+ User: Samahani
116
+ Bot: Hamna shida, pole sana.
117
+
118
+ User: Pole
119
+ Bot: Asante, Mungu atasaidia.
training_data/vocubulary.txt ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ User: Nini maana ya "haraka"?
2
+ Bot: "Haraka" inamaanisha kufanya kitu kwa spidi au bila kuchelewesha. Kwa Kiingereza: "quickly" or "fast".
3
+
4
+ User: Nini maana ya "polepole"?
5
+ Bot: "Polepole" inamaanisha kufanya kitu kwa utulivu au kwa mwendo wa chini. Kwa Kiingereza: "slowly".
6
+
7
+ User: Tafsiri "house" kwa Kiswahili
8
+ Bot: "House" inatafsiriwa kama "nyumba" kwa Kiswahili.
9
+
10
+ User: Nini maana ya "chakula"?
11
+ Bot: "Chakula" ni kitu tunachokula kwa kulishiba au kupata nishati. Kwa Kiingereza: "food".
12
+
13
+ User: Tofautisha kati ya "mkate" na "chapo"
14
+ Bot: "Mkate" ni vyakula vilivyookwa kwa unga wa ngano. "Chapo" ni aina ya mkate ulio na mafuta na umekatwa kwa vipande.
15
+
16
+ User: Nini maana ya "maji"?
17
+ Bot: "Maji" ni kiowevu kisicho na rangi au harufu kinachotiririka. Kwa Kiingereza: "water".
18
+
19
+ User: Tafsiri "school" kwa Kiswahili
20
+ Bot: "School" inatafsiriwa kama "shule" kwa Kiswahili.
21
+
22
+ User: Nini maana ya "rafiki"?
23
+ Bot: "Rafiki" ni mtu unayefahamiana naye vizuri na una uhusiano mwema. Kwa Kiingereza: "friend".
24
+
25
+ User: Taja maneno matatu yanayohusiana na "mlimajengu"
26
+ Bot: 1. Mlima - mountain
27
+ 2. Kilele - peak
28
+ 3. Mtelemko - slope
29
+
30
+ User: Nini maana ya "soko"?
31
+ Bot: "Soko" ni mahala panapouzwa na kununuliwa bidhaa mbalimbali. Kwa Kiingereza: "market".
32
+
33
+ User: Tafsiri "book" kwa Kiswahili
34
+ Bot: "Book" inatafsiriwa kama "kitabu" kwa Kiswahili.
35
+
36
+ User: Nini maana ya "mwalimu"?
37
+ Bot: "Mwalimu" ni mtu anayefundisha watu wengine. Kwa Kiingereza: "teacher".
38
+
39
+ User: Tofautisha kati ya "meza" na "kiti"
40
+ Bot: "Meza" ni fanicha ya gorofa inayotumika kuwekea vitu. "Kiti" ni fanicha ya kukalia.
41
+
42
+ User: Nini maana ya "moto"?
43
+ Bot: "Moto" inamaanisha "fire" au "heat" kwa Kiingereza. Usichanganye na "mtoto", ambalo linamaanisha "child".
44
+
45
+ User: Tafsiri "family" kwa Kiswahili
46
+ Bot: "Family" inatafsiriwa kama "familia" au "jamii" kwa Kiswahili.
47
+
48
+ User: Nini maana ya "kupika"?
49
+ Bot: "Kupika" ni kitendo cha kuandaa chakula kwa kutumia joto. Kwa Kiingereza: "to cook".
50
+
51
+ User: Taja maneno matatu ya rangi
52
+ Bot: 1. Nyekundu - red
53
+ 2. Blu - blue
54
+ 3. Kijani - green
55
+
56
+ User: Nini maana ya "mji"?
57
+ Bot: "Mji" ni eneo kubwa lenye watu wengi na majengo. Kwa Kiingereza: "city" or "town".
58
+
59
+ User: Tafsiri "animal" kwa Kiswahili
60
+ Bot: "Animal" inatafsiriwa kama "mnyama" kwa Kiswahili.
61
+
62
+ User: Nini maana ya "mvua"?
63
+ Bot: "Mvua" ni maji yanayomwagika kutoka angani kama matone. Kwa Kiingereza: "rain".
64
+
65
+ User: Tofautisha kati ya "mto" na "ziwa"
66
+ Bot: "Mto" ni mkondo wa maji unaotiririka. "Ziwa" ni mkusanyiko wa maji uliowekwa kwenye bonde.
67
+
68
+ User: Nini maana ya "jua"?
69
+ Bot: "Jua" ni nyota inayotoa mwanga na joto duniani. Kwa Kiingereza: "sun".
70
+
71
+ User: Tafsiri "clothes" kwa Kiswahili
72
+ Bot: "Clothes" inatafsiriwa kama "nguo" kwa Kiswahili.
73
+
74
+ User: Nini maana ya "kulala"?
75
+ Bot: "Kulala" ni kitendo cha kupumzika kwa macho yamefungwa. Kwa Kiingereza: "to sleep".
76
+
77
+ User: Taja maneno matatu ya nambari
78
+ Bot: 1. Moja - one
79
+ 2. Mbili - two
80
+ 3. Tatu - three
81
+
82
+ User: Nini maana ya "barabara"?
83
+ Bot: "Barabara" ni njia pana iliyotengenezwa kwa magari kupita. Kwa Kiingereza: "road".
84
+
85
+ User: Tafsiri "money" kwa Kiswahili
86
+ Bot: "Money" inatafsiriwa kama "pesa" kwa Kiswahili.
87
+
88
+ User: Nini maana ya "kusoma"?
89
+ Bot: "Kusoma" inaweza kumaanisha kusoma vitabu au kusoma shule. Kwa Kiingereza: "to read" or "to study".
90
+
91
+ User: Tofautisha kati ya "mwezi" na "mwanga"
92
+ Bot: "Mwezi" ni gimba la angani linaloonekana usiku (moon), au kipindi cha takriban siku thelathini (month). "Mwanga" ni nuru yoyote inayotoka kwenye vyanzo mbalimbali.