anki/oldanki/media.py
2013-01-08 09:43:29 +09:00

287 lines
9.5 KiB
Python

# -*- coding: utf-8 -*-
# Copyright: Damien Elmes <oldanki@ichi2.net>
# License: GNU GPL, version 3 or later; http://www.gnu.org/copyleft/gpl.html
"""\
Media support
====================
"""
__docformat__ = 'restructuredtext'
import os, shutil, re, urllib2, time, tempfile, unicodedata, urllib
from oldanki.db import *
from oldanki.utils import checksum, genID
from oldanki.lang import _
# other code depends on this order, so don't reorder
regexps = ("(?i)(\[sound:([^]]+)\])",
"(?i)(<img[^>]+src=[\"']?([^\"'>]+)[\"']?[^>]*>)")
# Tables
##########################################################################
mediaTable = Table(
'media', metadata,
Column('id', Integer, primary_key=True, nullable=False),
Column('filename', UnicodeText, nullable=False),
# reused as reference count
Column('size', Integer, nullable=False),
# treated as modification date, not creation date
Column('created', Float, nullable=False),
# reused as md5sum. empty string if file doesn't exist on disk
Column('originalPath', UnicodeText, nullable=False, default=u""),
# older versions stored original filename here, so we'll leave it for now
# in case we add a feature to rename media back to its original name. in
# the future we may want to zero this to save space
Column('description', UnicodeText, nullable=False, default=u""))
class Media(object):
pass
mapper(Media, mediaTable)
mediaDeletedTable = Table(
'mediaDeleted', metadata,
Column('mediaId', Integer, ForeignKey("cards.id"),
nullable=False),
Column('deletedTime', Float, nullable=False))
# File handling
##########################################################################
def copyToMedia(deck, path):
"""Copy PATH to MEDIADIR, and return new filename.
If a file with the same md5sum exists in the DB, return that.
If a file with the same name exists, return a unique name.
This does not modify the media table."""
# see if have duplicate contents
newpath = deck.s.scalar(
"select filename from media where originalPath = :cs",
cs=checksum(open(path, "rb").read()))
# check if this filename already exists
if not newpath:
base = os.path.basename(path)
mdir = deck.mediaDir(create=True)
newpath = uniquePath(mdir, base)
shutil.copy2(path, newpath)
return os.path.basename(newpath)
def uniquePath(dir, base):
# remove any dangerous characters
base = re.sub(r"[][<>:/\\&]", "", base)
# find a unique name
(root, ext) = os.path.splitext(base)
def repl(match):
n = int(match.group(1))
return " (%d)" % (n+1)
while True:
path = os.path.join(dir, root + ext)
if not os.path.exists(path):
break
reg = " \((\d+)\)$"
if not re.search(reg, root):
root = root + " (1)"
else:
root = re.sub(reg, repl, root)
return path
# DB routines
##########################################################################
def updateMediaCount(deck, file, count=1):
mdir = deck.mediaDir()
if deck.s.scalar(
"select 1 from media where filename = :file", file=file):
deck.s.statement(
"update media set size = size + :c, created = :t where filename = :file",
file=file, c=count, t=time.time())
elif count > 0:
try:
sum = unicode(
checksum(open(os.path.join(mdir, file), "rb").read()))
except:
sum = u""
deck.s.statement("""
insert into media (id, filename, size, created, originalPath, description)
values (:id, :file, :c, :mod, :sum, '')""",
id=genID(), file=file, c=count, mod=time.time(),
sum=sum)
def removeUnusedMedia(deck):
ids = deck.s.column0("select id from media where size = 0")
for id in ids:
deck.s.statement("insert into mediaDeleted values (:id, :t)",
id=id, t=time.time())
deck.s.statement("delete from media where size = 0")
# String manipulation
##########################################################################
def mediaFiles(string, remote=False):
l = []
for reg in regexps:
for (full, fname) in re.findall(reg, string):
isLocal = not re.match("(https?|ftp)://", fname.lower())
if not remote and isLocal:
l.append(fname)
elif remote and not isLocal:
l.append(fname)
return l
def stripMedia(txt):
for reg in regexps:
txt = re.sub(reg, "", txt)
return txt
def escapeImages(string):
def repl(match):
tag = match.group(1)
fname = match.group(2)
if re.match("(https?|ftp)://", fname):
return tag
return tag.replace(
fname, urllib.quote(fname.encode("utf-8")))
return re.sub(regexps[1], repl, string)
# Rebuilding DB
##########################################################################
def rebuildMediaDir(deck, delete=False, dirty=True):
mdir = deck.mediaDir()
if not mdir:
return (0, 0)
deck.startProgress(title=_("Check Media DB"))
# set all ref counts to 0
deck.s.statement("update media set size = 0")
# look through cards for media references
refs = {}
normrefs = {}
def norm(s):
if isinstance(s, unicode):
return unicodedata.normalize('NFD', s)
return s
for (question, answer) in deck.s.all(
"select question, answer from cards"):
for txt in (question, answer):
for f in mediaFiles(txt):
if f in refs:
refs[f] += 1
else:
refs[f] = 1
normrefs[norm(f)] = True
# update ref counts
for (file, count) in refs.items():
updateMediaCount(deck, file, count)
# find unused media
unused = []
for file in os.listdir(mdir):
path = os.path.join(mdir, file)
if not os.path.isfile(path):
# ignore directories
continue
nfile = norm(file)
if nfile not in normrefs:
unused.append(file)
# optionally delete
if delete:
for f in unused:
path = os.path.join(mdir, f)
os.unlink(path)
# remove entries in db for unused media
removeUnusedMedia(deck)
# check md5s are up to date
update = []
for (file, created, md5) in deck.s.all(
"select filename, created, originalPath from media"):
path = os.path.join(mdir, file)
if not os.path.exists(path):
if md5:
update.append({'f':file, 'sum':u"", 'c':time.time()})
else:
sum = unicode(
checksum(open(os.path.join(mdir, file), "rb").read()))
if md5 != sum:
update.append({'f':file, 'sum':sum, 'c':time.time()})
if update:
deck.s.statements("""
update media set originalPath = :sum, created = :c where filename = :f""",
update)
# update deck and get return info
if dirty:
deck.flushMod()
nohave = deck.s.column0("select filename from media where originalPath = ''")
deck.finishProgress()
return (nohave, unused)
# Download missing
##########################################################################
def downloadMissing(deck):
urlbase = deck.getVar("mediaURL")
if not urlbase:
return None
mdir = deck.mediaDir(create=True)
deck.startProgress()
missing = 0
grabbed = 0
for c, (f, sum) in enumerate(deck.s.all(
"select filename, originalPath from media")):
path = os.path.join(mdir, f)
if not os.path.exists(path):
try:
rpath = urlbase + f
url = urllib2.urlopen(rpath)
open(f, "wb").write(url.read())
grabbed += 1
except:
if sum:
# the file is supposed to exist
deck.finishProgress()
return (False, rpath)
else:
# ignore and keep going
missing += 1
deck.updateProgress(label=_("File %d...") % (grabbed+missing))
deck.finishProgress()
return (True, grabbed, missing)
# Convert remote links to local ones
##########################################################################
def downloadRemote(deck):
mdir = deck.mediaDir(create=True)
refs = {}
deck.startProgress()
for (question, answer) in deck.s.all(
"select question, answer from cards"):
for txt in (question, answer):
for f in mediaFiles(txt, remote=True):
refs[f] = True
tmpdir = tempfile.mkdtemp(prefix="oldanki")
failed = []
passed = []
for c, link in enumerate(refs.keys()):
try:
path = os.path.join(tmpdir, os.path.basename(link))
url = urllib2.urlopen(link)
open(path, "wb").write(url.read())
newpath = copyToMedia(deck, path)
passed.append([link, newpath])
except:
failed.append(link)
deck.updateProgress(label=_("Download %d...") % c)
for (url, name) in passed:
deck.s.statement(
"update fields set value = replace(value, :url, :name)",
url=url, name=name)
deck.updateProgress(label=_("Updating references..."))
deck.updateProgress(label=_("Updating cards..."))
# rebuild entire q/a cache
for m in deck.models:
deck.updateCardsFromModel(m, dirty=True)
deck.finishProgress()
deck.flushMod()
return (passed, failed)