Coverage for modules/org/openteacher/logic/loaders/voca/voca : 92%
Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
|
#! /usr/bin/env python3 # -*- coding: utf-8 -*-
# Copyright 2011, Milan Boers # Copyright 2011-2012, Marten de Vries # Copyright 2009, Dennis Hofs # # This file is part of OpenTeacher. # # OpenTeacher is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # OpenTeacher is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with OpenTeacher. If not, see <http://www.gnu.org/licenses/>.
#The comments describing the file format are taken from Voca's source #code. For that reason, Dennis Hofs is listed in the copyright section of #this file (licenses of both projects match). The comments come from the #following files: # #- WordListSerialiserImpl40.cs #- SerialiserFactoryImpl40.cs #- SerialiserFactoryImpl30.cs #- GrammarSerialiser.cs #- EmbeddedFileSerialiser.cs #- ViewStateSerialiserImpl40.cs #- WordListSerialiserImpl10.cs #- SerialiserFactoryImpl10.cs #- CharsetEncodings.cs
def atEnd(self):
"default": 432, } self._mm.mods(type="translator"), ) self._mm.mods(type="wordsStringParser"), self._mm.mods(type="mimicryTypefaceConverter"), )
def _convertMimicryTypeface(self):
def _parse(self):
except IndexError: _, ngettext = str, lambda a, b, n: a if n == 1 else b else: self._mm.resourcePath("translations") ) #TRANSLATORS: This is one of the file formats OpenTeacher #TRANSLATORS: can read. It's named after the program that uses #TRANSLATORS: it. See http://www.oriente-voca.eu/ for more info #TRANSLATORS: on it.
except IndexError: pass else:
if hasattr(self, "_reader"): # pragma: no cover del self._reader
#this is found directly after the header #(name + version numbers) #note that each call to this function parses only one of the two blocks. # # 4 foreign language name size (int) # * foreign language name (UTF-8 string) # 4 foreign font name size (int) # * foreign font name (UTF-8 string) # 4 foreign font size # # 4 reference language name size (int) # * reference language name (UTF-8 string) # 4 reference font name size (int) # * reference font name (UTF-8 string) # 4 reference font size
# 4 grammar item count (int) # ( # 4 category size (int) # * category (UTF-8 string) # 4 value size (int) # * value (UTF-8 string) # )*
# ( # (file path is an empty string) # 8 [ 0 ] (long) # | # (file path is not an empty string) # 8 file size (long) # 4 extension size (int) # * extension (UTF-8 string) # * file data (bytes with length of file size) # ) self._skipUtf8String() self._reader.skipBytes(fileSize)
#0xA is the header length
# 4 phonetic font name size (int) # * phonetic font name (UTF-8 string) # 4 phonetic font size # # 4 foreign characters size (int) # * foreign characters (UTF-8 string)
#skip it all
# 4 part of speech count (int) # ( # 4 part of speech size (int) # * part of speech (UTF-8 string) # 4 grammar category count (int) # ( # 4 grammar category size (int) # * grammar category (UTF-8 string) # )* # )*
#skip it all. Bit harder this time.
# 1 uploaded (bool) # ( # (if uploaded) # 4 upload name size (int) # * upload name (UTF-8 string) # 1 has foreign language ISO code (bool) # ( # (if has foreign language ISO code) # 3 foreign language ISO code (char[]) # )? # 1 has reference language ISO code (bool) # ( # (if has reference language ISO code) # 3 reference language ISO code (char[]) # )? # 4 course size (int) # * course (UTF-8 string) # 4 comments size (int) # * comments (UTF-8 string) # )?
#skip it all. Starting to be annoying... self._skipUtf8String() for i in range(2): hasLanguageIsoCode = self._reader.readBool() if hasLanguageIsoCode: self._reader.skipBytes(3) for i in range(2): self._skipUtf8String()
# 4 mastered score (1-10) (int) # 4 maximum mastered chance (0 for 0%, 100 for 100%) (int) # 4 exam count (int)
#skip all
# 4 active exercise type (int) # 4 exercise type count (int) # (
# ( # 1 false # 4 exercise type name size (int) # * exercise type name (UTF-8 string) # ) # | # ( # 1 true # 4 size of language pack path to exercise type name (int) # * language pack path to exercise type name (UTF-8 string) # )
#skip it
# 4 answer type size (int) # * answer type (UTF-8 string) # 4 exercise configuration count (int) # ( # ( # 1 false # 4 exercise configuration name size (int) # * exercise configuration name (UTF-8 string) # ) # | # ( # 1 true # 4 size of language pack path to exercise configuration name (int) # * language pack path to exercise configuration name (UTF-8 string) # )
# 4 question field count (int) # ( # 4 question field size (int) # * question field (UTF-8 string) # )* # 4 answer field count (int) # ( # 1 check answer field (bool) # 4 answer field size (int) # * answer field (UTF-8 string) # )* # 4 info field count (int) # ( # 4 info field size (int) # * info field (UTF-8 string) # )* # )* # )*
#Finally. Some interesting stuff again. # 4 item count (int) # ( # 4 sequence (int) # 4 word size (int) # * word (UTF-8 string) # 1 word score (byte) # 4 word trans size (int) # * word trans (UTF-8 string) # 1 word trans score (byte) # 4 context size (int) # * context (UTF-8 string) # 1 context score (byte) # 4 phonetic size (int) # * phonetic (UTF-8 string) # 1 phonetic score (byte) # 4 part of speech size (int) # * part of speech (UTF-8 string) # * grammar (see GrammarSerialiser) # 1 grammar score (byte) # * media (see EmbeddedFileSerialiser) # 1 media score (byte) # * image (see EmbeddedFileSerialiser) -- ATTENTION: This is 4.0 only # 1 image score (byte) -- ATTENTION: This is 4.0 only # 4 lesson size (int) # * lesson (UTF-8 string) # 4 page size (int) # * page (UTF-8 string)
#convert mimicry font symbols
"id": wordId, "questions": self._parse(question), "answers": self._parse(answer), }) # )*
"items": [], "resources": [], }
#We've got all the interesting stuff by now. Just in case I #missed something, the remaining part of the file format #description:
# Reads or writes a global view state in the following format: # # 1 mode (MainForm.Mode) # 1 apply filters (bool) # 4 filter count (int) # ( # 1 filter class (0 = StringFilter, # 1 = BooleanFilter, # 2 = StringListFilter) # 4 type size (int) # * type (UTF-8 string) # ( # (StringFilter) # 1 search type (StringFilter.SearchTypes) # 4 string size (int) # * string (UTF-8 string) # | # (BooleanFilter) # 1 converter type (0 = EmbeddedFileBooleanConverter) # 1 value (bool) # | # (StringListFilter) # 4 string count (int) # ( # 4 string size (int) # * string (UTF-8 string) # )* # ) # )* # 4 applied filter count (int) # ( # 4 string size (int) # * applied filter type (UTF-8 string) # )* # 4 unsatisfied filter count (int) # ( # 4 string size (int) # * unsatisfied filter type (UTF-8 string) # )* # 4 dirty indices count (int) # ( # 4 dirty index (int) # )* # 4 column count (int) # ( # 4 type size # * type (UTF-8 string) # 1 visible (bool) # 4 width (int) # )* # 1 selection type (Selection.SelectionType) # ( # (CellSelection) # 4 row (int) # 4 column (int) # | # (RowSelection) # 4 first row (int) # 4 last row (int) # )? # 4 top row (int) # 4 left column (int) # 1 has editor view state (bool) # ( # 1 editor view state class (0 = DefaultViewState, 1 = TextViewState) # ( # (TextViewState) # 4 selection start (int) # 4 selection length (int) # )? # )? # ( # 1 input language set (bool) # ( # (if input language set) # 4 culture size (int) # * culture (UTF-8 string) # 4 layout size (int) # * layout (UTF-8 string) # )? # ){3} (for input modes: foreign, reference, phonetic)
"items": [], "resources": [], }
"items": [], "resources": [], }
# 22 [ 2E F6 26 4F 25 BD 2C 59 03 8F 59 AA 1E 34 46 07 6C 4A 44 CF 28 00 ] #check magic id
# * foreign language (null-terminated string) # 1 foreign font charset (charset) # * foreign font name (null-terminated string) # 1 foreign font size (byte) # * reference language (null-terminated string) # 1 reference font charset (charset) # * reference font name (null-terminated string) # 1 reference font size (byte)
# * foreign characters (null-terminated string)
# 2 [ 0D 00 ]
# The format of a list of items is as follows. The end of the list # should correspond to the end of the stream. # # ( # * word (null-terminated string) # * word trans (null-terminated string) # * context (null-terminated string) # * part of speech (null-terminated string) # * lesson (null-terminated string) # 4 sound size (int) # * sound (bytes) # 1 foreign to reference set (exercise level) # 1 reference to foreign set (exercise level) # 1 foreign to reference progress (byte, max 30) # 1 reference to foreign progress (byte, max 30) # 1 foreign to reference asked (byte, max 5) # 1 reference to foreign asked (byte, max 5) # 1 foreign to reference score (byte, 5 low order bits are answers to last questions) # 1 reference to foreign score (byte, 5 low order bits are answers to last questions) # 2 [ 0D 00 ] # )*
#convert mimicry typeface chars into unicode
"id": next(counter), "questions": self._parse(question), "answers": self._parse(answer), })
#public const byte ANSI_CHARSET = 0; #public const byte DEFAULT_CHARSET = 1; #public const byte SHIFTJIS_CHARSET = 128; #public const byte HANGEUL_CHARSET = 129; #public const byte GB2312_CHARSET = 134; #public const byte CHINESEBIG5_CHARSET = 136; #public const byte JOHAB_CHARSET = 130; #public const byte HEBREW_CHARSET = 177; #public const byte ARABIC_CHARSET = 178; #public const byte GREEK_CHARSET = 161; #public const byte TURKISH_CHARSET = 162; #public const byte VIETNAMESE_CHARSET = 163; #public const byte THAI_CHARSET = 222; #public const byte EASTEUROPE_CHARSET = 238; #public const byte RUSSIAN_CHARSET = 204; #public const byte BALTIC_CHARSET = 186;
#case DEFAULT_CHARSET: #case ANSI_CHARSET: # return Encoding.GetEncoding("windows-1252"); #case EASTEUROPE_CHARSET: # return Encoding.GetEncoding("windows-1250"); #case BALTIC_CHARSET: # return Encoding.GetEncoding("windows-1257"); #case RUSSIAN_CHARSET: # return Encoding.GetEncoding("windows-1251"); #case GREEK_CHARSET: # return Encoding.GetEncoding("windows-1253"); #case TURKISH_CHARSET: # return Encoding.GetEncoding("windows-1254"); #case ARABIC_CHARSET: # return Encoding.GetEncoding("windows-1256"); #case HEBREW_CHARSET: # return Encoding.GetEncoding("windows-1255"); #case THAI_CHARSET: # return Encoding.GetEncoding("windows-874"); #case VIETNAMESE_CHARSET: # return Encoding.GetEncoding("windows-1258"); #case GB2312_CHARSET: # return Encoding.GetEncoding("gb2312"); #case CHINESEBIG5_CHARSET: # return Encoding.GetEncoding("big5"); #case SHIFTJIS_CHARSET: # return Encoding.GetEncoding("shift_jis"); #case HANGEUL_CHARSET: # return Encoding.GetEncoding("ks_c_5601-1987"); #case JOHAB_CHARSET: # return Encoding.GetEncoding("Johab"); #default: # return Encoding.GetEncoding("windows-1252");
0: "windows-1252", 1: "windows-1252", 128: "shift_jis", 129: "ks_c_5601-1987", 134: "gb2312", 136: "big5", 130: "Johab", 177: "windows-1255", 178: "windows-1256", 161: "windows-1253", 162: "windows-1254", 163: "windows-1258", 222: "windows-874", 238: "windows-1250", 204: "windows-1251", 186: "windows-1257", }[code]
#header: # # 8 VOCAWRDL # 1 major version (4) # 1 minor version (0)
#used by _parse40() and _parse30() (and methods they call) else: # pragma: no cover raise ValueError("Unknown file format version") else: #give vocatude a chance on reading the file
"resources": {}, "list": list, }
|