Coverage for modules/org/openteacher/logic/loaders/voca/voca: 92%

Hot-keys on this page

r m x p toggle line displays

j k next/prev highlighted chunk

0 (zero) top of page

1 (one) first highlighted chunk

#! /usr/bin/env python3

# -*- coding: utf-8 -*-

# This file is part of OpenTeacher.

# OpenTeacher is free software: you can redistribute it and/or modify

# it under the terms of the GNU General Public License as published by

# the Free Software Foundation, either version 3 of the License, or

# (at your option) any later version.

# OpenTeacher is distributed in the hope that it will be useful,

# but WITHOUT ANY WARRANTY; without even the implied warranty of

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License

# along with OpenTeacher. If not, see <http://www.gnu.org/licenses/>.

#The comments describing the file format are taken from Voca's source

#code. For that, Dennis Hofs is mentioned in the copyright section of

#this file (licenses of both projects match). The comments came from the

#following files:

#- WordListSerialiserImpl40.cs

#- SerialiserFactoryImpl40.cs

#- SerialiserFactoryImpl30.cs

#- GrammarSerialiser.cs

#- EmbeddedFileSerialiser.cs

#- ViewStateSerialiserImpl40.cs

#- WordListSerialiserImpl10.cs

#- SerialiserFactoryImpl10.cs

#- CharsetEncodings.cs

import struct

import collections

import itertools

class Reader:

def __init__(self, data, *args, **kwargs):

super().__init__(*args, **kwargs)

self._data = data

self._pos = 0

@property

def atEnd(self):

return self._pos == len(self._data)

def skip(self, fmt):

size = struct.calcsize(fmt)

self._pos += size

def read(self, fmt):

startPos = self._pos

self.skip(fmt)

return struct.unpack(fmt, self._data[startPos:self._pos])[0]

readInt = lambda self: self.read(">i")

exit readChar = lambda self: self.read(">b")

readUnsignedChar = lambda self: self.read(">B")

readLongLong = lambda self: self.read(">q")

readBool = lambda self: self.read(">?")

readBytes = lambda self, length: self.read(">%ss" % length)

skipInt = lambda self: self.skip(">i")

skipChar = lambda self: self.skip(">b")

skipBool = lambda self: self.skip(">?")

skipBytes = lambda self, length: self.skip(">%ss" % length)

class VocaLoaderModule:

def __init__(self, moduleManager, *args, **kwargs):

super().__init__(*args, **kwargs)

self.type = "load"

self.priorities = {

"default": 432,

}

self._mm = moduleManager

self.uses = (

self._mm.mods(type="translator"),

)

self.requires = (

self._mm.mods(type="wordsStringParser"),

self._mm.mods(type="mimicryTypefaceConverter"),

)

self.filesWithTranslations = ("voca.py",)

@property

def _convertMimicryTypeface(self):

return self._modules.default("active", type="mimicryTypefaceConverter").convert

@property

def _parse(self):

return self._modules.default("active", type="wordsStringParser").parse

def _retranslate(self):

try:

translator = self._modules.default("active", type="translator")

except IndexError:

_, ngettext = str, lambda a, b, n: a if n == 1 else b

else:

_, ngettext = translator.gettextFunctions(

self._mm.resourcePath("translations")

)

#TRANSLATORS: This is one of the file formats OpenTeacher

#TRANSLATORS: can read. It's named after the program that uses

#TRANSLATORS: it. See http://www.oriente-voca.eu/ for more info

#TRANSLATORS: on it.

self.name = _("Voca")

def enable(self):

self.loads = {"wdl": ["words"]}

self.mimetype = "application/x-oriente-voca"

self._modules = set(self._mm.mods(type="modules")).pop()

try:

translator = self._modules.default("active", type="translator")

except IndexError:

pass

else:

translator.languageChanged.handle(self._retranslate)

self._retranslate()

self.active = True

def disable(self):

self.active = False

del self.name

del self.loads

del self.mimetype

del self._modules

if hasattr(self, "_reader"): # pragma: no cover

del self._reader

def getFileTypeOf(self, path):

exit if path.endswith(".wdl"):

return "words"

def _readUtf8String(self):

size = self._reader.readInt()

return self._reader.readBytes(size).decode("UTF-8")

def _skipUtf8String(self):

size = self._reader.readInt()

self._reader.skipBytes(size)

def _readLang(self):

#this is found directly after the header

#(name + version numbers)

#note this function only parses one of the two blocks at once.

# 4 foreign language name size (int)

# * foreign language name (UTF-8 string)

# 4 foreign font name size (int)

# * foreign font name (UTF-8 string)

# 4 foreign font size

# 4 reference language name size (int)

# * reference language name (UTF-8 string)

# 4 reference font name size (int)

# * reference font name (UTF-8 string)

# 4 reference font size

lang = self._readUtf8String()

font = self._readUtf8String()

self._reader.skipInt()

return (lang, font)

def _skipGrammar(self):

# 4 grammar item count (int)

# (

itemCount = self._reader.readInt()

for i in range(itemCount):

# 4 category size (int)

# * category (UTF-8 string)

# 4 value size (int)

# * value (UTF-8 string)

for i in range(2):

self._skipUtf8String()

# )*

def _skipEmbeddedFile(self):

# (

# (file path is an empty string)

# 8 [ 0 ] (long)

# |

# (file path is not an empty string)

# 8 file size (long)

# 4 extension size (int)

# * extension (UTF-8 string)

# * file data (bytes with length of file size)

# )

fileSize = self._reader.readLongLong()

203 if fileSize:

self._skipUtf8String()

self._reader.skipBytes(fileSize)

def _readLangs(self, list):

answerLanguage, answerFont = self._readLang()

questionLanguage, questionFont = self._readLang()

list["questionLanguage"] = questionLanguage

list["answerLanguage"] = answerLanguage

return (questionFont, answerFont)

def _skipHeader(self):

#0xA is the header length

self._reader.skipBytes(0xA)

def _skipPhoneticAndForeignChars(self):

# 4 phonetic font name size (int)

# * phonetic font name (UTF-8 string)

# 4 phonetic font size

# 4 foreign characters size (int)

# * foreign characters (UTF-8 string)

#skip it all

self._skipUtf8String()

self._reader.skipInt()

self._skipUtf8String()

def _skipPartOfSpeech(self):

# 4 part of speech count (int)

# (

# 4 part of speech size (int)

# * part of speech (UTF-8 string)

# 4 grammar category count (int)

# (

# 4 grammar category size (int)

# * grammar category (UTF-8 string)

# )*

#skip it all. Bit harder this time.

partOfSpeechCount = self._reader.readInt()

for i in range(partOfSpeechCount):

self._skipUtf8String()

grammarCategoryCount = self._reader.readInt()

for i in range(grammarCategoryCount):

self._skipUtf8String()

def _skipUploaded(self):

# 1 uploaded (bool)

# (

# (if uploaded)

# 4 upload name size (int)

# * upload name (UTF-8 string)

# 1 has foreign language ISO code (bool)

# (

# (if has foreign language ISO code)

# 3 foreign language ISO code (char[])

# )?

# 1 has reference language ISO code (bool)

# (

# (if has reference language ISO code)

# 3 reference language ISO code (char[])

# )?

# 4 course size (int)

# * course (UTF-8 string)

# 4 comments size (int)

# * comments (UTF-8 string)

# )?

#skip it all. Starting to be annoying...

uploaded = self._reader.readBool()

277 if uploaded:

self._skipUtf8String()

for i in range(2):

hasLanguageIsoCode = self._reader.readBool()

if hasLanguageIsoCode:

self._reader.skipBytes(3)

for i in range(2):

self._skipUtf8String()

def _skip40OnlyExerciseInfo(self):

# 4 mastered score (1-10) (int)

# 4 maximum mastered chance (0 for 0%, 100 for 100%) (int)

# 4 exam count (int)

#skip all

for i in range(3):

self._reader.skipInt()

def _skipCommonExerciseInfo(self):

# 4 active exercise type (int)

# 4 exercise type count (int)

# (

self._reader.skipInt()

exerciseTypeCount = self._reader.readInt()

for i in range(exerciseTypeCount):

# (

# 1 false

# 4 exercise type name size (int)

# * exercise type name (UTF-8 string)

# )

# |

# (

# 1 true

# 4 size of language pack path to exercise type name (int)

# * language pack path to exercise type name (UTF-8 string)

# )

#skip it

self._reader.skipBool()

self._skipUtf8String()

# 4 answer type size (int)

# * answer type (UTF-8 string)

# 4 exercise configuration count (int)

# (

self._skipUtf8String()

exerciseConfigCount = self._reader.readInt()

for i in range(exerciseConfigCount):

# (

# 1 false

# 4 exercise configuration name size (int)

# * exercise configuration name (UTF-8 string)

# )

# |

# (

# 1 true

# 4 size of language pack path to exercise configuration name (int)

# * language pack path to exercise configuration name (UTF-8 string)

# )

self._reader.skipBool()

self._skipUtf8String()

# 4 question field count (int)

# (

# 4 question field size (int)

# * question field (UTF-8 string)

# )*

# 4 answer field count (int)

# (

# 1 check answer field (bool)

# 4 answer field size (int)

# * answer field (UTF-8 string)

# )*

# 4 info field count (int)

# (

# 4 info field size (int)

# * info field (UTF-8 string)

# )*

questionFieldCount = self._reader.readInt()

for i in range(questionFieldCount):

self._skipUtf8String()

answerFieldCount = self._reader.readInt()

for i in range(answerFieldCount):

self._reader.skipBool()

self._skipUtf8String()

infoFieldCount = self._reader.readInt()

for i in range(infoFieldCount):

self._skipUtf8String()

# )*

def _readItems(self, list, questionFont, answerFont, loadImages):

#Finally. Some interesting stuff again.

# 4 item count (int)

# (

itemCount = self._reader.readInt()

for wordId in range(itemCount):

# 4 sequence (int)

# 4 word size (int)

# * word (UTF-8 string)

# 1 word score (byte)

# 4 word trans size (int)

# * word trans (UTF-8 string)

# 1 word trans score (byte)

# 4 context size (int)

# * context (UTF-8 string)

# 1 context score (byte)

# 4 phonetic size (int)

# * phonetic (UTF-8 string)

# 1 phonetic score (byte)

# 4 part of speech size (int)

# * part of speech (UTF-8 string)

# * grammar (see GrammarSerialiser)

# 1 grammar score (byte)

# * media (see EmbeddedFileSerialiser)

# 1 media score (byte)

# * image (see EmbeddedFileSerialiser) -- ATTENTION: This is 4.0 only

# 1 image score (byte) -- ATTENTION: This is 4.0 only

# 4 lesson size (int)

# * lesson (UTF-8 string)

# 4 page size (int)

# S* page (UTF-8 string)

self._reader.skipInt()

answer = self._readUtf8String()

self._reader.skipChar()

question = self._readUtf8String()

self._reader.skipChar()

for i in range(2):

self._skipUtf8String()

self._reader.skipChar()

self._skipUtf8String()

self._skipGrammar()

self._reader.skipChar()

self._skipEmbeddedFile()

self._reader.skipChar()

if loadImages:

self._skipEmbeddedFile()

self._reader.skipChar()

for i in range(2):

self._skipUtf8String()

#convert mimicry font symbols

question = self._convertMimicryTypeface(questionFont, question)

answer = self._convertMimicryTypeface(answerFont, answer)

list["items"].append({

"id": wordId,

"questions": self._parse(question),

"answers": self._parse(answer),

})

# )*

def _parse40(self):

list = {

"items": [],

"resources": [],

}

self._skipHeader()

questionFont, answerFont = self._readLangs(list)

self._skipPhoneticAndForeignChars()

self._skipPartOfSpeech()

self._skipUploaded()

self._skip40OnlyExerciseInfo()

self._skipCommonExerciseInfo()

self._readItems(list, questionFont, answerFont, loadImages=True)

#We've got all the interesting stuff by now. Just in case I

#missed something, the remaining part of the file format

#description:

# Reads or writes a global view state in the following format:

# 1 mode (MainForm.Mode)

# 1 apply filters (bool)

# 4 filter count (int)

# (

# 1 filter class (0 = StringFilter,

# 1 = BooleanFilter,

# 2 = StringListFilter)

# 4 type size (int)

# * type (UTF-8 string)

# (

# (StringFilter)

# 1 search type (StringFilter.SearchTypes)

# 4 string size (int)

# * string (UTF-8 string)

# |

# (BooleanFilter)

# 1 converter type (0 = EmbeddedFileBooleanConverter)

# 1 value (bool)

# |

# (StringListFilter)

# 4 string count (int)

# (

# 4 string size (int)

# * string (UTF-8 string)

# )*

# )

# )*

# 4 applied filter count (int)

# (

# 4 string size (int)

# * applied filter type (UTF-8 string)

# )*

# 4 unsatisfied filter count (int)

# (

# 4 string size (int)

# * unsatisfied filter type (UTF-8 string)

# )*

# 4 dirty indices count (int)

# (

# 4 dirty index (int)

# )*

# 4 column count (int)

# (

# 4 type size

# * type (UTF-8 string)

# 1 visible (bool)

# 4 width (int)

# )*

# 1 selection type (Selection.SelectionType)

# (

# (CellSelection)

# 4 row (int)

# 4 column (int)

# |

# (RowSelection)

# 4 first row (int)

# 4 last row (int)

# )?

# 4 top row (int)

# 4 left column (int)

# 1 has editor view state (bool)

# (

# 1 editor view state class (0 = DefaultViewState, 1 = TextViewState)

# (

# (TextViewState)

# 4 selection start (int)

# 4 selection length (int)

# )?

# (

# 1 input language set (bool)

# (

# (if input language set)

# 4 culture size (int)

# * culture (UTF-8 string)

# 4 layout size (int)

# * layout (UTF-8 string)

# )?

# ){3} (for input modes: foreign, reference, phonetic)

return list

def _parse30(self):

list = {

"items": [],

"resources": [],

}

self._skipHeader()

questionFont, answerFont = self._readLangs(list)

self._skipPhoneticAndForeignChars()

self._skipPartOfSpeech()

self._skipUploaded()

self._skipCommonExerciseInfo()

self._readItems(list, questionFont, answerFont, loadImages=False)

return list

def _readNullTerminatedString(self):

string = b''

while True:

char = self._reader.readUnsignedChar()

if not char:

break

string += bytes([char])

return string

def _skipNullTerminatedString(self):

while True:

char = self._reader.readUnsignedChar()

if not char:

break

def _parse10(self):

list = {

"items": [],

"resources": [],

}

# 22 [ 2E F6 26 4F 25 BD 2C 59 03 8F 59 AA 1E 34 46 07 6C 4A 44 CF 28 00 ]

magic = self._reader.readBytes(22)

#check magic id

assert(magic == bytes.fromhex("2ef6264f25bd2c59038f59aa1e3446076c4a44cf2800"))

# * foreign language (null-terminated string)

# 1 foreign font charset (charset)

# * foreign font name (null-terminated string)

# 1 foreign font size (byte)

# * reference language (null-terminated string)

# 1 reference font charset (charset)

# * reference font name (null-terminated string)

# 1 reference font size (byte)

answerLang = self._readNullTerminatedString()

answerEncoding = self._readCharset()

list["answerLanguage"] = answerLang.decode(answerEncoding)

answerFont = self._readNullTerminatedString()

self._reader.skipChar()

questionLang = self._readNullTerminatedString()

questionEncoding = self._readCharset()

list["questionLanguage"] = questionLang.decode(questionEncoding)

questionFont = self._readNullTerminatedString()

self._reader.skipChar()

# * foreign characters (null-terminated string)

self._skipNullTerminatedString()

# 2 [ 0D 00 ]

twoBytes = self._reader.readBytes(2)

assert(twoBytes == bytes.fromhex("0d00"))

# The format of a list of items is as follows. The end of the list

# should correspond to the end of the stream.

# (

# * word (null-terminated string)

# * word trans (null-terminated string)

# * context (null-terminated string)

# * part of speech (null-terminated string)

# * lesson (null-terminated string)

# 4 sound size (int)

# * sound (bytes)

# 1 foreign to reference set (exercise level)

# 1 reference to foreign set (exercise level)

# 1 foreign to reference progress (byte, max 30)

# 1 reference to foreign progress (byte, max 30)

# 1 foreign to reference asked (byte, max 5)

# 1 reference to foreign asked (byte, max 5)

# 1 foreign to reference score (byte, 5 low order bits are answers to last questions)

# 1 reference to foreign score (byte, 5 low order bits are answers to last questions)

# 2 [ 0D 00 ]

# )*

counter = itertools.count()

while not self._reader.atEnd:

answer = self._readNullTerminatedString().decode(answerEncoding)

question = self._readNullTerminatedString().decode(questionEncoding)

for i in range(3):

self._skipNullTerminatedString()

soundSize = self._reader.readInt()

self._reader.skipBytes(soundSize)

self._reader.skipBytes(8)

twoBytes = self._reader.readBytes(2)

assert(twoBytes == bytes.fromhex("0d00"))

#convert mimicry typeface chars into unicode

question = self._convertMimicryTypeface(questionFont, question)

answer = self._convertMimicryTypeface(answerFont, answer)

list["items"].append({

"id": next(counter),

"questions": self._parse(question),

"answers": self._parse(answer),

})

return list

def _readCharset(self):

#public const byte ANSI_CHARSET = 0;

#public const byte DEFAULT_CHARSET = 1;

#public const byte SHIFTJIS_CHARSET = 128;

#public const byte HANGEUL_CHARSET = 129;

#public const byte GB2312_CHARSET = 134;

#public const byte CHINESEBIG5_CHARSET = 136;

#public const byte JOHAB_CHARSET = 130;

#public const byte HEBREW_CHARSET = 177;

#public const byte ARABIC_CHARSET = 178;

#public const byte GREEK_CHARSET = 161;

#public const byte TURKISH_CHARSET = 162;

#public const byte VIETNAMESE_CHARSET = 163;

#public const byte THAI_CHARSET = 222;

#public const byte EASTEUROPE_CHARSET = 238;

#public const byte RUSSIAN_CHARSET = 204;

#public const byte BALTIC_CHARSET = 186;

#case DEFAULT_CHARSET:

#case ANSI_CHARSET:

# return Encoding.GetEncoding("windows-1252");

#case EASTEUROPE_CHARSET:

# return Encoding.GetEncoding("windows-1250");

#case BALTIC_CHARSET:

# return Encoding.GetEncoding("windows-1257");

#case RUSSIAN_CHARSET:

# return Encoding.GetEncoding("windows-1251");

#case GREEK_CHARSET:

# return Encoding.GetEncoding("windows-1253");

#case TURKISH_CHARSET:

# return Encoding.GetEncoding("windows-1254");

#case ARABIC_CHARSET:

# return Encoding.GetEncoding("windows-1256");

#case HEBREW_CHARSET:

# return Encoding.GetEncoding("windows-1255");

#case THAI_CHARSET:

# return Encoding.GetEncoding("windows-874");

#case VIETNAMESE_CHARSET:

# return Encoding.GetEncoding("windows-1258");

#case GB2312_CHARSET:

# return Encoding.GetEncoding("gb2312");

#case CHINESEBIG5_CHARSET:

# return Encoding.GetEncoding("big5");

#case SHIFTJIS_CHARSET:

# return Encoding.GetEncoding("shift_jis");

#case HANGEUL_CHARSET:

# return Encoding.GetEncoding("ks_c_5601-1987");

#case JOHAB_CHARSET:

# return Encoding.GetEncoding("Johab");

#default:

# return Encoding.GetEncoding("windows-1252");

code = self._reader.readUnsignedChar()

return {

0: "windows-1252",

1: "windows-1252",

128: "shift_jis",

129: "ks_c_5601-1987",

134: "gb2312",

136: "big5",

130: "Johab",

177: "windows-1255",

178: "windows-1256",

161: "windows-1253",

162: "windows-1254",

163: "windows-1258",

222: "windows-874",

238: "windows-1250",

204: "windows-1251",

186: "windows-1257",

}[code]

def load(self, path):

with open(path, "rb") as f:

data = f.read()

#header:

# 8 VOCAWRDL

# 1 major version (4)

# 1 minor version (0)

FileStart = collections.namedtuple("FileStart", "magicId majorVersion minorVersion")

start = FileStart._make(struct.unpack(">8sBB", data[:0xA]))

#used by _parse40() and _parse30() (and methods they call)

self._reader = Reader(data)

if start.magicId == b"VOCAWRDL":

if start.majorVersion == 4 and start.minorVersion == 0:

list = self._parse40()

elif start.majorVersion == 3 and start.minorVersion == 0:

list = self._parse30()

else: # pragma: no cover

raise ValueError("Unknown file format version")

else:

#give vocatude a chance on reading the file

list = self._parse10()

return {

"resources": {},

"list": list,

}

def init(moduleManager):

return VocaLoaderModule(moduleManager)

Coverage for modules/org/openteacher/logic/loaders/voca/voca : 92%

249 statements 236 run 13 missing 4 excluded 4 partial