th1enq committed on
Commit
07693f5
·
verified ·
1 Parent(s): 61a0c01

Upload features.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. features.py +347 -0
features.py ADDED
@@ -0,0 +1,347 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ from bs4 import BeautifulSoup
3
+
4
+
5
+ with open("mini_dataset/6.html") as f:
6
+ test = f.read()
7
+
8
+ soup = BeautifulSoup(test, "html.parser")
9
+ """
10
+
11
+
12
+ # has_title
13
def has_title(soup):
    """Return 1 if the document has a non-empty title, else 0.

    Reads ``soup.title``; a missing title (``None``) or a title whose
    ``text`` is the empty string both yield 0.
    """
    title = soup.title
    if title is None:
        return 0
    return 1 if title.text else 0
20
+
21
+
22
+ # has_input
23
def has_input(soup):
    """Return 1 if ``soup.find_all("input")`` finds any element, else 0."""
    return 1 if soup.find_all("input") else 0
28
+
29
+
30
+ # has_button
31
def has_button(soup):
    """Return 1 if ``soup.find_all("button")`` finds any element, else 0."""
    return 1 if soup.find_all("button") else 0
36
+
37
+
38
+ # has_image
39
def has_image(soup):
    """Return 1 if ``soup.find_all("image")`` finds any element, else 0."""
    # NOTE(review): this matches elements named "image", not HTML <img>
    # (number_of_img counts those) -- confirm that is the intended feature.
    return 1 if soup.find_all("image") else 0
44
+
45
+
46
+ # has_submit
47
def has_submit(soup):
    """Return 1 if any ``<input>`` has ``type="submit"``, else 0.

    Only ``input`` elements are inspected.
    """
    inputs = soup.find_all("input")
    return 1 if any(tag.get("type") == "submit" for tag in inputs) else 0
54
+
55
+
56
+ # has_link
57
def has_link(soup):
    """Return 1 if ``soup.find_all("link")`` finds any element, else 0."""
    return 1 if soup.find_all("link") else 0
62
+
63
+
64
+ # has_password
65
def has_password(soup):
    """Return 1 if any ``<input>`` looks like a password field, else 0.

    An input matches when its ``type``, ``name`` or ``id`` attribute equals
    ``"password"``.  Each attribute is checked on its own: the previous
    ``(type or name or id) == "password"`` form compared only the first
    non-empty attribute, so ``<input type="text" name="password">`` was
    missed.
    """
    for field in soup.find_all("input"):
        candidates = (field.get("type"), field.get("name"), field.get("id"))
        if "password" in candidates:
            return 1
    return 0
72
+
73
+
74
+ # has_email_input
75
def has_email_input(soup):
    """Return 1 if any ``<input>`` looks like an e-mail field, else 0.

    An input matches when its ``type``, ``id`` or ``name`` attribute equals
    ``"email"``.  Each attribute is checked on its own: the previous
    ``(type or id or name) == "email"`` form compared only the first
    non-empty attribute, so ``<input type="text" name="email">`` was missed.
    """
    for field in soup.find_all("input"):
        candidates = (field.get("type"), field.get("id"), field.get("name"))
        if "email" in candidates:
            return 1
    return 0
82
+
83
+
84
+ # has_hidden_element
85
def has_hidden_element(soup):
    """Return 1 if any ``<input>`` has ``type="hidden"``, else 0.

    Only ``input`` elements are inspected.
    """
    inputs = soup.find_all("input")
    return 1 if any(tag.get("type") == "hidden" for tag in inputs) else 0
92
+
93
+
94
+ # has_audio
95
def has_audio(soup):
    """Return 1 if ``soup.find_all("audio")`` finds any element, else 0."""
    return 1 if soup.find_all("audio") else 0
100
+
101
+
102
+ # has_video
103
def has_video(soup):
    """Return 1 if ``soup.find_all("video")`` finds any element, else 0."""
    return 1 if soup.find_all("video") else 0
108
+
109
+
110
+ # number_of_inputs
111
def number_of_inputs(soup):
    """Return how many elements ``soup.find_all("input")`` finds."""
    return len(soup.find_all("input"))
113
+
114
+
115
+ # number_of_buttons
116
def number_of_buttons(soup):
    """Return how many elements ``soup.find_all("button")`` finds."""
    return len(soup.find_all("button"))
118
+
119
+
120
+ # number_of_images
121
def number_of_images(soup):
    """Return the number of ``image`` elements plus image-marked ``meta`` elements.

    A ``<meta>`` is counted when its ``type`` or its ``name`` attribute
    equals ``"image"``.  The previous condition,
    ``meta.get("type") or meta.get("name") == "image"``, counted every meta
    element that had any non-empty ``type`` attribute at all.
    """
    # NOTE(review): this counts elements named "image", not HTML <img>
    # (number_of_img counts those) -- confirm that is the intended feature.
    image_tags = len(soup.find_all("image"))
    meta_images = sum(
        1
        for meta in soup.find_all("meta")
        if meta.get("type") == "image" or meta.get("name") == "image"
    )
    return image_tags + meta_images
128
+
129
+
130
+ # number_of_option
131
def number_of_option(soup):
    """Return how many elements ``soup.find_all("option")`` finds."""
    return len(soup.find_all("option"))
133
+
134
+
135
+ # number_of_list
136
def number_of_list(soup):
    """Return how many list items (``li`` elements) ``soup.find_all`` finds."""
    return len(soup.find_all("li"))
138
+
139
+
140
+ # number_of_TH
141
def number_of_TH(soup):
    """Return how many elements ``soup.find_all("th")`` finds."""
    return len(soup.find_all("th"))
143
+
144
+
145
+ # number_of_TR
146
def number_of_TR(soup):
    """Return how many elements ``soup.find_all("tr")`` finds."""
    return len(soup.find_all("tr"))
148
+
149
+
150
+ # number_of_href
151
def number_of_href(soup):
    """Return how many ``link`` elements carry a non-empty ``href`` attribute.

    Only elements returned by ``soup.find_all("link")`` are considered.
    """
    return sum(1 for link in soup.find_all("link") if link.get("href"))
157
+
158
+
159
+ # number_of_paragraph
160
def number_of_paragraph(soup):
    """Return how many elements ``soup.find_all("p")`` finds."""
    return len(soup.find_all("p"))
162
+
163
+
164
+ # number_of_script
165
def number_of_script(soup):
    """Return how many elements ``soup.find_all("script")`` finds."""
    return len(soup.find_all("script"))
167
+
168
+
169
+ # length_of_title
170
def length_of_title(soup):
    """Return the character length of the title text, or 0 without a title.

    Reads ``soup.title``; ``None`` means there is no title.
    """
    title = soup.title
    # Identity check: "no title" is signalled by None itself, so do not rely
    # on whatever ``==`` the title object implements.
    if title is None:
        return 0
    return len(title.text)
174
+
175
+
176
+ """
177
+ print("has_title --> ", has_title(soup))
178
+ print("has_input --> ", has_input(soup))
179
+ print("has_button --> ", has_button(soup))
180
+ print("has_image --> ", has_image(soup))
181
+ print("has_submit --> ", has_submit(soup))
182
+ print("has_link --> ", has_link(soup))
183
+ print("has_password --> ", has_password(soup))
184
+ print("has_email_input --> ", has_email_input(soup))
185
+ print("has_hidden_element --> ", has_hidden_element(soup))
186
+ print("has_audio --> ", has_audio(soup))
187
+ print("has_video --> ", has_video(soup))
188
+ print("number_of_inputs --> ", number_of_inputs(soup))
189
+ print("number_of_buttons --> ", number_of_buttons(soup))
190
+ print("number_of_images --> ", number_of_images(soup))
191
+ print("number_of_option --> ", number_of_option(soup))
192
+ print("number_of_list --> ", number_of_list(soup))
193
+ print("number_of_TH --> ", number_of_TH(soup))
194
+ print("number_of_TR --> ", number_of_TR(soup))
195
+ print("number_of_href --> ", number_of_href(soup))
196
+ print("number_of_paragraph --> ", number_of_paragraph(soup))
197
+ print("number_of_script --> ", number_of_script(soup))
198
+ print("length_of_title --> ", length_of_title(soup))
199
+
200
+ """
201
+
202
+
203
+ # has h1
204
def has_h1(soup):
    """Return 1 if ``soup.find_all("h1")`` finds any element, else 0."""
    return 1 if soup.find_all("h1") else 0
209
+
210
+
211
+ # has h2
212
def has_h2(soup):
    """Return 1 if ``soup.find_all("h2")`` finds any element, else 0."""
    return 1 if soup.find_all("h2") else 0
217
+
218
+
219
+ # has h3
220
def has_h3(soup):
    """Return 1 if ``soup.find_all("h3")`` finds any element, else 0."""
    return 1 if soup.find_all("h3") else 0
225
+
226
+
227
+ # length of text
228
def length_of_text(soup):
    """Return the length of the string produced by ``soup.get_text()``."""
    return len(soup.get_text())
230
+
231
+
232
+ # number of clickable button
233
def number_of_clickable_button(soup):
    """Return how many ``button`` elements have ``type="button"``.

    Buttons with any other ``type`` value, or with no ``type`` attribute,
    are not counted.
    """
    return sum(1 for btn in soup.find_all("button") if btn.get("type") == "button")
239
+
240
+
241
+ # number of a
242
def number_of_a(soup):
    """Return how many anchor (``a``) elements ``soup.find_all`` finds."""
    return len(soup.find_all("a"))
244
+
245
+
246
+ # number of img
247
def number_of_img(soup):
    """Return how many elements ``soup.find_all("img")`` finds."""
    return len(soup.find_all("img"))
249
+
250
+
251
+ # number of div elements
252
def number_of_div(soup):
    """Return how many elements ``soup.find_all("div")`` finds.

    Every ``div`` is counted, whether or not it has a ``class`` attribute.
    """
    return len(soup.find_all("div"))
254
+
255
+
256
+ # number of figures
257
def number_of_figure(soup):
    """Return how many elements ``soup.find_all("figure")`` finds."""
    return len(soup.find_all("figure"))
259
+
260
+
261
+ # has footer
262
def has_footer(soup):
    """Return 1 if ``soup.find_all("footer")`` finds any element, else 0."""
    return 1 if soup.find_all("footer") else 0
267
+
268
+
269
+ # has form
270
def has_form(soup):
    """Return 1 if ``soup.find_all("form")`` finds any element, else 0."""
    return 1 if soup.find_all("form") else 0
275
+
276
+
277
+ # has textarea
278
def has_text_area(soup):
    """Return 1 if ``soup.find_all("textarea")`` finds any element, else 0."""
    return 1 if soup.find_all("textarea") else 0
283
+
284
+
285
+ # has iframe
286
def has_iframe(soup):
    """Return 1 if ``soup.find_all("iframe")`` finds any element, else 0."""
    return 1 if soup.find_all("iframe") else 0
291
+
292
+
293
+ # has text input
294
def has_text_input(soup):
    """Return 1 if any ``<input>`` has ``type="text"``, else 0.

    Inputs without a ``type`` attribute are not counted.
    """
    inputs = soup.find_all("input")
    return 1 if any(tag.get("type") == "text" for tag in inputs) else 0
299
+
300
+
301
+ # number of meta
302
def number_of_meta(soup):
    """Return how many elements ``soup.find_all("meta")`` finds."""
    return len(soup.find_all("meta"))
304
+
305
+
306
+ # has nav
307
def has_nav(soup):
    """Return 1 if ``soup.find_all("nav")`` finds any element, else 0."""
    return 1 if soup.find_all("nav") else 0
312
+
313
+
314
+ # has object
315
def has_object(soup):
    """Return 1 if ``soup.find_all("object")`` finds any element, else 0."""
    return 1 if soup.find_all("object") else 0
320
+
321
+
322
+ # has picture
323
def has_picture(soup):
    """Return 1 if ``soup.find_all("picture")`` finds any element, else 0."""
    return 1 if soup.find_all("picture") else 0
328
+
329
+
330
+ # number of sources
331
def number_of_sources(soup):
    """Return how many elements ``soup.find_all("source")`` finds."""
    return len(soup.find_all("source"))
333
+
334
+
335
+ # number of span
336
def number_of_span(soup):
    """Return how many elements ``soup.find_all("span")`` finds."""
    return len(soup.find_all("span"))
338
+
339
+
340
+ # number of table
341
def number_of_table(soup):
    """Return how many elements ``soup.find_all("table")`` finds."""
    return len(soup.find_all("table"))
343
+
344
+
345
+
346
+
347
+