Spaces:
Runtime error
Runtime error
dkoshman
committed on
Commit
·
8ab1767
1
Parent(s):
ae308b4
trying to make data generator work from remote
Browse files
- data_generator.py +35 -17
- data_preprocessing.py +1 -1
data_generator.py
CHANGED
@@ -4,6 +4,7 @@ import os
|
|
4 |
import string
|
5 |
import subprocess
|
6 |
import random
|
|
|
7 |
|
8 |
|
9 |
class DotDict(dict):
|
@@ -101,28 +102,32 @@ def generate_equation(latex: DotDict, size, depth=3):
|
|
101 |
return equation
|
102 |
|
103 |
|
104 |
-
def generate_image(directory: str,
|
|
|
|
|
|
|
105 |
"""
|
106 |
Generates a random tex file and corresponding image
|
107 |
-------
|
108 |
params:
|
109 |
:directory: -- dir where to save files
|
110 |
-
:
|
111 |
-
:filename: --
|
112 |
:max_length: -- max size of equation
|
|
|
|
|
|
|
113 |
"""
|
114 |
-
# TODO ARGPARSE
|
115 |
-
filepath = directory
|
116 |
|
117 |
-
|
118 |
-
latex = json.load(file)
|
119 |
-
latex = DotDict(latex)
|
120 |
|
121 |
template = string.Template(latex.template)
|
122 |
font, font_options = random.choice(latex.fonts)
|
123 |
font_option = random.choice([''] + font_options)
|
124 |
fontsize = random.choice(latex.fontsizes)
|
125 |
-
equation = generate_equation(latex,
|
126 |
tex = template.substitute(font=font, font_option=font_option, fontsize=fontsize, equation=equation)
|
127 |
|
128 |
files_before = set(os.listdir(directory))
|
@@ -130,7 +135,7 @@ def generate_image(directory: str, latex_path: str, filename: str, max_length=20
|
|
130 |
file.write(tex)
|
131 |
|
132 |
pr1 = subprocess.run(
|
133 |
-
f"pdflatex -output-directory={directory} {filepath}.tex".split(),
|
134 |
stderr=subprocess.PIPE,
|
135 |
)
|
136 |
|
@@ -138,23 +143,23 @@ def generate_image(directory: str, latex_path: str, filename: str, max_length=20
|
|
138 |
if pr1.returncode != 0:
|
139 |
files_to_delete = files_after - files_before
|
140 |
if files_to_delete:
|
141 |
-
subprocess.run(['rm'] + [directory
|
142 |
print(pr1.stderr.decode(), tex)
|
143 |
return
|
144 |
|
145 |
pr2 = subprocess.run(
|
146 |
-
f"
|
147 |
stderr=subprocess.PIPE,
|
148 |
)
|
149 |
|
150 |
files_to_delete = files_after - files_before - {filename + '.png', filename + '.tex'}
|
151 |
if files_to_delete:
|
152 |
-
subprocess.run(['rm'] + [directory
|
153 |
assert (pr2.returncode == 0)
|
154 |
|
155 |
|
156 |
def generate_data(
|
157 |
-
filenames:
|
158 |
directory: str,
|
159 |
latex_path: str,
|
160 |
overwrite: bool = False
|
@@ -168,14 +173,27 @@ def generate_data(
|
|
168 |
:latex_path: - full path to latex json
|
169 |
:overwrite: - whether to overwrite existing files
|
170 |
"""
|
|
|
|
|
|
|
|
|
|
|
171 |
|
172 |
filenames = set(filenames)
|
173 |
if not overwrite:
|
174 |
-
existing = set(
|
|
|
|
|
175 |
filenames -= existing
|
176 |
|
|
|
|
|
|
|
|
|
177 |
while filenames:
|
178 |
-
|
179 |
-
|
|
|
|
|
180 |
existing = set(file.split('.')[0] for file in os.listdir(directory) if file.endswith('.png'))
|
181 |
filenames -= existing
|
|
|
4 |
import string
|
5 |
import subprocess
|
6 |
import random
|
7 |
+
from typing import Iterable
|
8 |
|
9 |
|
10 |
class DotDict(dict):
|
|
|
102 |
return equation
|
103 |
|
104 |
|
105 |
+
def generate_image(directory: str, latex: DotDict, filename: str, max_length=20, equation_depth=3,
|
106 |
+
pdflatex: str = "/external2/dkkoshman/venv/texlive/2022/bin/x86_64-linux/pdflatex",
|
107 |
+
ghostscript: str = "/external2/dkkoshman/venv/local/gs/bin/gs"
|
108 |
+
):
|
109 |
"""
|
110 |
Generates a random tex file and corresponding image
|
111 |
-------
|
112 |
params:
|
113 |
:directory: -- dir where to save files
|
114 |
+
:latex: -- DotDict with parameters to generate tex
|
115 |
+
:filename: -- absolute filename for the generated files
|
116 |
:max_length: -- max size of equation
|
117 |
+
:equation_depth: -- max nested level of tex scopes
|
118 |
+
:pdflatex: -- path to pdflatex
|
119 |
+
:ghostscript: -- path to ghostscript
|
120 |
"""
|
121 |
+
# TODO ARGPARSE
|
122 |
+
filepath = os.path.join(directory, filename)
|
123 |
|
124 |
+
equation_length = random.randint(1, max_length)
|
|
|
|
|
125 |
|
126 |
template = string.Template(latex.template)
|
127 |
font, font_options = random.choice(latex.fonts)
|
128 |
font_option = random.choice([''] + font_options)
|
129 |
fontsize = random.choice(latex.fontsizes)
|
130 |
+
equation = generate_equation(latex, equation_length, depth=equation_depth)
|
131 |
tex = template.substitute(font=font, font_option=font_option, fontsize=fontsize, equation=equation)
|
132 |
|
133 |
files_before = set(os.listdir(directory))
|
|
|
135 |
file.write(tex)
|
136 |
|
137 |
pr1 = subprocess.run(
|
138 |
+
f"{pdflatex} -output-directory={directory} {filepath}.tex".split(),
|
139 |
stderr=subprocess.PIPE,
|
140 |
)
|
141 |
|
|
|
143 |
if pr1.returncode != 0:
|
144 |
files_to_delete = files_after - files_before
|
145 |
if files_to_delete:
|
146 |
+
subprocess.run(['rm'] + [os.path.join(directory, file) for file in files_to_delete])
|
147 |
print(pr1.stderr.decode(), tex)
|
148 |
return
|
149 |
|
150 |
pr2 = subprocess.run(
|
151 |
+
f"{ghostscript} -sDEVICE=png16m -dTextAlphaBits=4 -r200 -dSAFER -dBATCH -dNOPAUSE -o {filepath}.png {filepath}.pdf".split(),
|
152 |
stderr=subprocess.PIPE,
|
153 |
)
|
154 |
|
155 |
files_to_delete = files_after - files_before - {filename + '.png', filename + '.tex'}
|
156 |
if files_to_delete:
|
157 |
+
subprocess.run(['rm'] + [os.path.join(directory, file) for file in files_to_delete])
|
158 |
assert (pr2.returncode == 0)
|
159 |
|
160 |
|
161 |
def generate_data(
|
162 |
+
filenames: Iterable[str],
|
163 |
directory: str,
|
164 |
latex_path: str,
|
165 |
overwrite: bool = False
|
|
|
173 |
:latex_path: - full path to latex json
|
174 |
:overwrite: - whether to overwrite existing files
|
175 |
"""
|
176 |
+
subprocess.run(". /external2/dkkoshman/venv/bin/activate")
|
177 |
+
if not os.path.isabs(directory):
|
178 |
+
directory = os.path.join(os.getcwd(), directory)
|
179 |
+
if not os.path.isabs(latex_path):
|
180 |
+
latex_path = os.path.join(os.getcwd(), latex_path)
|
181 |
|
182 |
filenames = set(filenames)
|
183 |
if not overwrite:
|
184 |
+
existing = set(
|
185 |
+
filename for file in os.listdir(directory) for filename, ext in os.path.splitext(file) if ext == '.png'
|
186 |
+
)
|
187 |
filenames -= existing
|
188 |
|
189 |
+
with open(latex_path) as file:
|
190 |
+
latex = json.load(file)
|
191 |
+
latex = DotDict(latex)
|
192 |
+
|
193 |
while filenames:
|
194 |
+
for name in filenames:
|
195 |
+
generate_image(directory, latex, name)
|
196 |
+
# with Pool() as pool:
|
197 |
+
# pool.starmap(generate_image, ((directory, latex, name) for name in filenames))
|
198 |
existing = set(file.split('.')[0] for file in os.listdir(directory) if file.endswith('.png'))
|
199 |
filenames -= existing
|
data_preprocessing.py
CHANGED
@@ -23,7 +23,7 @@ class TexImageDataset(Dataset):
|
|
23 |
torch.multiprocessing.set_sharing_strategy('file_system')
|
24 |
self.root_dir = root_dir
|
25 |
self.filenames = sorted(set(
|
26 |
-
os.
|
27 |
))
|
28 |
self.image_transform = image_transform
|
29 |
self.tex_transform = tex_transform
|
|
|
23 |
torch.multiprocessing.set_sharing_strategy('file_system')
|
24 |
self.root_dir = root_dir
|
25 |
self.filenames = sorted(set(
|
26 |
+
filename for file in os.listdir(root_dir) for filename, ext in os.path.splitext(file) if ext == '.png'
|
27 |
))
|
28 |
self.image_transform = image_transform
|
29 |
self.tex_transform = tex_transform
|