PDF页面重整脚本

之前写的PDF切边脚本效果并不理想,今天想到个新办法。先上效果图:

右边是原页面,左边是重整后的页面。
原理就是把页面切成一块块,然后重新排布,使其宽度尽量小,就更适合在kindle上阅读了

最后是代码,就两个函数。pdf和图像间互相转化的方法,在开始提到的那个文章里有

#!/usr/bin/env python2
from PIL import Image
 
# Gap threshold in pixels: split() merges a region into the previous one when
# it starts fewer than this many lines after the previous region ended.
# reform() also uses it as the spacing between the pieces it pastes.
continue_lower_bound = 20
# Minimum width / height (pixels) a cropped piece needs for split() to keep it.
piece_min_width = 1
piece_min_height = 1
# Blank margins (pixels) reform() adds around the rearranged page.
top, right, bottom, left = 3, 3, 3, 3
 
def split(im, vertically = True):
  """Cut an image into the strips of content separated by blank bands.

  The image is scanned one pixel line at a time: rows when `vertically` is
  true, columns otherwise.  A region opens at the first line that differs
  from the line before it, and closes at the first line that is identical
  to its predecessor and holds fewer than three non-white (< 255) pixels.
  A region opening fewer than `continue_lower_bound` lines after the
  previous one ended is merged into it.  A region that is still open when
  the scan reaches the image edge is closed there instead of being dropped.

  Pixel values are compared against 255, so `im` is expected to be a
  single-band image (e.g. mode "L").

  Returns the cropped pieces, in scan order, whose size is at least
  `piece_min_width` x `piece_min_height`.
  """
  pixels = im.load()
  width, height = im.size
  limit = height if vertically else width

  def line_at(index):
    # One full row (vertical scan) or column (horizontal scan) of pixels.
    if vertically:
      return [pixels[x, index] for x in range(width)]
    return [pixels[index, y] for y in range(height)]

  regions = []

  def close_region(begin, end):
    # Extend the previous region when the gap is small, else record a new one.
    if regions and begin - regions[-1][1] < continue_lower_bound:
      regions[-1] = (regions[-1][0], end)
    else:
      regions.append((begin, end))

  start = None
  previous = None
  for index in range(limit):
    current = line_at(index)
    if previous:
      changed = any(a != b for a, b in zip(current, previous))
      if changed:
        if start is None:
          start = index
      elif start is not None and len([v for v in current if v < 255]) < 3:
        close_region(start, index - 1)
        start = None
    previous = current
  if start is not None:
    # Content runs up to the image edge; the crop box end is exclusive, so
    # `limit` keeps the last line.
    close_region(start, limit)

  pieces = []
  for begin, end in regions:
    box = (0, begin, width, end) if vertically else (begin, 0, end, height)
    piece = im.crop(box)
    piece_width, piece_height = piece.size
    if piece_width >= piece_min_width and piece_height >= piece_min_height:
      pieces.append(piece)

  return pieces
 
def reform(f, outname = None):
  """Rearrange the content blocks of a page image into a narrower page.

  The page is split into horizontal strips, each strip is split into
  side-by-side pieces, and the pieces are pasted onto a new white
  grayscale canvas row by row, `continue_lower_bound` pixels apart, with
  the `top`/`right`/`bottom`/`left` margins around the whole.

  f       -- path of the page image to read (also printed as progress).
  outname -- path to write; defaults to f + '.out.' + <image format>.
  """
  print(f)
  im = Image.open(f)
  if not outname:
    outname = f + '.out.' + im.format
  # split() compares pixel values as numbers, which fails on multi-band
  # (e.g. RGB) images, so work on a grayscale copy.  The format is read
  # above because a converted image no longer carries one.
  if im.mode != 'L':
    im = im.convert('L')

  rows = [split(strip, False) for strip in split(im, True)]
  rows = [row for row in rows if row]

  gap = continue_lower_bound
  row_heights = [max(p.size[1] for p in row) for row in rows]
  row_widths = [sum(p.size[0] for p in row) + (len(row) - 1) * gap
                for row in rows]

  image_width = max(row_widths) if row_widths else 0
  image_height = sum(row_heights)
  if rows:
    image_height += (len(rows) - 1) * gap

  image_width += left + right
  image_height += top + bottom

  out = Image.new("L", (image_width, image_height), 255)
  y = top
  for row, row_height in zip(rows, row_heights):
    x = left
    for piece in row:
      out.paste(piece, (x, y))
      x += piece.size[0] + gap
    y += row_height + gap

  out.save(outname)
 
def main():
  """Reform the page image named by the first command-line argument."""
  from sys import argv
  reform(argv[1])

if __name__ == '__main__':
  main()

generator-based python coroutine

https://bitbucket.org/reus/corpy/src这个的改进版,Queue和Counter都可以换成其他的实现,保证线程安全就行。yield命令也可以用decorator自定义。send和recv未实现。

# coding: utf8
 
import Queue
import multiprocessing
import ctypes
from types import *
import sys
import time
import thread
import threading
 
class TheQueue(Queue.Queue):
  """Queue.Queue with the non-blocking pop/push/empty interface the scheduler uses."""

  def pop(self):
    """Return the next item, or None when the queue is empty (never blocks)."""
    try:
      item = self.get(False)
    except Queue.Empty:
      item = None
    return item

  def push(self, value):
    """Append `value` to the queue."""
    self.put(value)

  def empty(self):
    """Report whether the queue currently holds no items."""
    return Queue.Queue.empty(self)
 
class Counter:
  """Lock-protected count of background jobs that are still outstanding.

  The scheduler loop keeps running while any counter is above zero; dec()
  additionally wakes the loop so it can re-check its exit condition.
  """

  def __init__(self):
    self.counter = multiprocessing.Value(ctypes.c_uint, 0)

  def inc(self):
    """Add one to the count."""
    # `with` guarantees the lock is released even if the update raises.
    with self.counter.get_lock():
      self.counter.value += 1

  def dec(self):
    """Subtract one from the count and notify the scheduler loop."""
    with self.counter.get_lock():
      self.counter.value -= 1
    with State.active:
      State.active.notify()
 
class State:
  """Process-wide scheduler state, held in class attributes."""
  processes = TheQueue()           # runnable (process, args) entries
  message_boxes = {}
  box_listeners = {}
  counters = {}                    # counter name -> Counter of pending background jobs
  yield_handlers = {}              # yield command -> handler function
  result = None                    # value handed to stop(); returned by run()
  active = threading.Condition()   # notified whenever there may be new work

  @classmethod
  def reset(cls):
    """Restore a fresh state; registered yield handlers and counter names are kept."""
    cls.processes = TheQueue()
    cls.message_boxes, cls.box_listeners = {}, {}
    for name in cls.counters:
      cls.counters[name] = Counter()
    cls.result = None
    cls.active = threading.Condition()

  @classmethod
  def counters_not_empty(cls):
    """Return True when at least one counter is above zero."""
    return any(entry.counter.value > 0 for entry in cls.counters.values())
 
def spawn(process, *args):
  """Queue `process` (a callable or a generator) with `args` and wake the scheduler."""
  State.processes.push((process, args))
  with State.active:
    State.active.notify()
 
def run(*processes):
  """Spawn the given processes, run the scheduler, and return the stop() value.

  Each argument is either a process or a (process, arg, ...) tuple.  The
  scheduler state is reset afterwards, also when the loop raises.
  """
  for entry in processes:
    call = entry if isinstance(entry, tuple) else (entry,)
    spawn(*call)
  try:
    try:
      _run()
    except StopSignal:
      pass
    ret = State.result
  finally:
    State.reset()
  return ret
 
def _run():
  """Scheduler loop: run queued entries until no work and no pending counter remain.

  A callable entry is called; if it returns a generator, that generator is
  queued.  A generator entry is advanced one step: a yielded None re-queues
  it, anything else is treated as (command, arg, ...) and passed to the
  handler registered for that command.
  """
  while State.counters_not_empty() or not State.processes.empty():
    entry = State.processes.pop()
    if not entry:
      # Nothing runnable yet, but background jobs are pending: wait for a notify.
      State.active.acquire()
      State.active.wait(10**10)
      State.active.release()
      continue

    task, task_args = entry
    if callable(task):
      outcome = task(*task_args)
      if isinstance(outcome, GeneratorType):
        spawn(outcome)
      continue
    if not isinstance(task, GeneratorType):
      continue

    try:
      yield_info = task.send(*task_args) if task_args else task.next()
    except StopIteration:
      continue
    if yield_info is None:
      spawn(task)
      continue

    yield_command = yield_info[0]
    if yield_command not in State.yield_handlers:
      print('Error: No handler for yield command %s' % yield_command)
      sys.exit()
    State.yield_handlers[yield_command](task, yield_info[1:])
 
class StopSignal(BaseException):
  """Raised by stop() to unwind out of the scheduler loop.

  Derives from BaseException, not Exception, so that it is a valid
  exception class while still passing through `except Exception` blocks in
  user code, as the previous plain class did.
  """

def stop(value):
  """Record `value` as the result of run() and abort the scheduler loop."""
  State.result = value
  raise StopSignal()
 
def yield_handler(yield_command, options = None):
  """Decorator factory: register the decorated function as a yield-command handler.

  yield_command -- first element of the yielded tuple that selects the handler.
  options       -- optional dict; a 'counter' key names a Counter that is
                   created in State.counters if it does not exist yet.

  The handler is called as handler(process, args), where `args` is the rest
  of the yielded tuple.  The decorated function is returned unchanged.
  """
  # None instead of a shared mutable {} default.
  if options is not None and 'counter' in options:
    name = options['counter']
    if name not in State.counters:
      State.counters[name] = Counter()
  def _decorator(func):
    State.yield_handlers[yield_command] = func
    return func
  return _decorator
 
@yield_handler('spawn')
def _yield_spawn(process, args):
  # Handler for `yield 'spawn', target, arg, ...`: `args` is everything after
  # the command, so the target is queued first and the yielding generator
  # (`process`) is queued right behind it.
  spawn(*args)
  spawn(process)
 
@yield_handler('sleep', {'counter': 'sleep'})
def _yield_sleep(process, args):
  """Handler for `yield 'sleep', seconds`: resume `process` after the delay.

  The 'sleep' counter is raised before the helper thread starts and lowered
  only after the process has been re-queued, so the scheduler loop does not
  exit while a sleeper is outstanding.
  """
  def _wake_later():
    delay = args[0]
    time.sleep(delay)
    spawn(process)
    State.counters['sleep'].dec()

  State.counters['sleep'].inc()
  thread.start_new_thread(_wake_later, ())
 
@yield_handler('pmap', {'counter': 'pmap'})
def _yield_pmap(process, args):
  """Handler for `yield 'pmap', pool_size, func, iterable[, timeout]`.

  Applies `func` to every item of `iterable` in threads, at most `pool_size`
  at a time, then resumes `process` with the list of results in input order.
  Once all threads are started, at most `timeout` seconds are spent waiting
  for them; results that are missing then (or whose call raised) stay None.

  Raises TypeError when the yielded tuple has the wrong number of arguments.
  """
  if len(args) == 4:
    pool_size, func, iterable, timeout = args
  elif len(args) == 3:
    pool_size, func, iterable = args
    timeout = 10**10
  else:
    # Fail here, in the scheduler, rather than inside the background thread
    # where the 'pmap' counter would never be decremented.
    raise TypeError('pmap expects (pool_size, func, iterable[, timeout])')

  def _f():
    # Materialise once so one-shot iterables are not consumed by the sizing pass.
    items = list(iterable)
    ret = [None] * len(items)
    semaphore = threading.Semaphore(pool_size)
    def _p(arg, position):
      try:
        ret[position] = func(arg)
      finally:
        # Always return the permit, or a failing call would block the
        # dispatch loop below forever.
        semaphore.release()
    workers = []
    for position, arg in enumerate(items):
      semaphore.acquire()
      worker = threading.Thread(target = _p, args = (arg, position))
      worker.start()
      workers.append(worker)
    start_time = time.time()
    for worker in workers:
      remaining = timeout - (time.time() - start_time)
      if remaining <= 0:
        break
      worker.join(remaining)
    spawn(process, ret)
    State.counters['pmap'].dec()
  State.counters['pmap'].inc()
  thread.start_new_thread(_f, tuple())

例子:

  # Factorial driven by the scheduler: every step queues the next one with
  # spawn() instead of calling it, so the Python call stack does not grow.
  def fac(n):
    return run((_fac, n, 1))
  def _fac(n, acc):
    if n > 1:
      spawn(_fac, n - 1, acc * n)
    else:
      stop(acc)  # ends run() and becomes its return value

  print fac(3000)

例子:

  def foo():
    print 'foo'
    # Queue bar('hello', 'world'), then queue foo again behind it.
    yield 'spawn', bar, 'hello', 'world'
    # Resume foo from a helper thread after a one second sleep.
    yield 'sleep', 1
    print 'foo'

  def bar(arg1, arg2):
    print arg1
    yield  # yielding None just re-queues this generator
    print arg2
    spawn(baz)

  # A custom yield command: resume the yielding generator with the
  # arguments joined by '-'.
  @yield_handler('baz')
  def _baz(process, args):
    spawn(process, '-'.join(args))

  def baz():
    ret = yield 'baz', 'B', 'A', 'Z'
    print ret

  run(foo)

输出:

foo
hello
world
B-A-Z
foo

例子:

  import urllib2
  def foo():
    # Run bar over the URLs in threads, two at a time; the generator is
    # resumed with the list of results once the workers are done.
    ret = yield 'pmap', 2, bar, [
      'http://qq.com',
      'http://baidu.com',
      'http://sina.com',
      'http://163.com',
      'http://sohu.com',
      ]
    print ret
  def bar(arg):
    return len(urllib2.urlopen(arg).read())
  def baz():
    # Keeps printing while the downloads above are in progress.
    for i in xrange(5):
      print i
      yield 'sleep', 1
  run(foo, baz)

输出:

0
1
2
3
[260623, 81, 584666, 337219, 329979]
4

Ubuntu 10.04 chroot jail on Archlinux

http://mirrors.163.com/ubuntu/pool/main/d/debootstrap/下载debootstrap_*_all.deb

然后解压
mkdir deb
cd deb
ar -x ___.deb


tar xvzf data.tar.gz
vi ./usr/sbin/debootstrap
增加一行
DEBOOTSTRAP_DIR="./usr/share/debootstrap"

建个目录放ubuntu的文件
mkdir ubuntu
./usr/sbin/debootstrap --arch amd64 lucid ubuntu/ http://mirrors.163.com/ubuntu
然后它就会开始下载ubuntu的文件了,等吧。

完成之后
mount --bind /proc ubuntu/proc
chroot ubuntu
就可以了

如果要在jail里访问外部的文件,可以用
mount --bind [host-dir] [jail-dir]

LXC on Archlinux

mount上cgroup

mkdir -p /cgroup
mount none -t cgroup /cgroup

安装lxc包

packer -S lxc

制作文件系统

pacman -S devtools
mkarchroot /lxc $PKGS

PKGS的值是:(或者可以更加精简的)

bash
binutils
bzip2
coreutils
diffutils
file
filesystem
findutils
gawk
gcc-libs
gettext
glibc
grep
gzip
heirloom-mailx
initscripts
iputils
less
libpipeline
licenses
logrotate
man-db
man-pages
pacman
procps
psmisc
sed
shadow
sysfsutils
syslog-ng
sysvinit
tar
texinfo
util-linux
wget
which
wpa_supplicant

创建设备节点

cd /lxc/dev
mknod -m 666 random c 1 8
mknod -m 666 urandom c 1 9
mkdir -m 755 pts
mkdir -m 1777 shm
mknod -m 666 tty c 5 0
mknod -m 666 tty0 c 4 0
mknod -m 666 tty1 c 4 1
mknod -m 666 tty2 c 4 2
mknod -m 666 full c 1 7
mknod -m 600 initctl p
mknod -m 666 ptmx c 5 2

容器配置文件

lxc.utsname = mylxc
lxc.mount = /lxc/lxc.fstab
lxc.rootfs = /lxc
lxc.tty = 1
 
lxc.network.type = macvlan
lxc.network.link = eth0
lxc.network.flags = up
lxc.network.ipv4 = 192.168.1.111/24
lxc.network.name = eth0

lxc.fstab

none /lxc/dev/pts devpts defaults 0 0
none /lxc/proc proc defaults 0 0
none /lxc/sys sysfs defaults 0 0
none /lxc/dev/shm tmpfs defaults 0 0

创建容器

lxc-create -n mylxc -f /lxc/lxc.conf

修改rc.sysinit

vi /lxc/etc/rc.sysinit
#!/bin/bash
# Minimal rc.sysinit for the container.
# Remove leftover pid files and subsystem lock files.
rm -f $(find /var/run -name '*pid')
rm -f /var/lock/subsys/*
# Recreate an empty /etc/mtab.
rm -f /etc/mtab
touch /etc/mtab
# Bring up the network interfaces and set the default route.
ip link set dev eth0 up
ip link set dev lo up
ip route add default via 192.168.1.1 dev eth0

把rc.conf中的DAEMONS中括号内的都删除
注释掉inittab中所有getty和login manager,除了tty1那行

启动

lxc-start -n mylxc

连接到终端

lxc-console -n mylxc

erlang环形基准测试练习

这是《Programming Erlang》的8.11习题二。

-module(looptest).
-export([start/2, main/1]).

%% start(N, M) -> Milliseconds
%% Spawns N processes, sends a counter message around them N * M times in
%% ring order, and returns the wall-clock time in milliseconds between the
%% first send and the stop notification.
start(N, M) ->
  Master = self(),
  Pids = [spawn(fun() -> slave(Master) end) || _ <- lists:seq(1, N)],
  statistics(wall_clock),
  hd(Pids) ! {0, N * M, N, Pids},
  receive
    stop ->
      {_, T} = statistics(wall_clock),
      %% Only the process that saw the final message has returned; the
      %% others are still blocked in receive, so terminate them here.
      lists:foreach(fun(Pid) -> exit(Pid, kill) end, Pids),
      T
  end.

%% slave(Master)
%% Forwards {Current, Max, N, Pids} to the next process in Pids with Current
%% incremented; when Current has reached Max it tells Master to stop instead.
slave(Master) ->
  receive
    {Max, Max, _, _} ->
      Master ! stop;
    {Current, Max, N, Pids} ->
      NextPid = lists:nth((Current + 1) rem N + 1, Pids),
      NextPid ! {Current + 1, Max, N, Pids},
      slave(Master)
  end.

%% main(Args): runs the benchmark with 100 processes and 200 rounds and
%% prints the elapsed milliseconds.
main(_) ->
  io:format("~pms", [start(100, 200)]).

====== update ======
一个改进的版本。上面的版本传递的消息很大,效率比较低。下面的版本要快几倍

-module(ring).
-export([main/1]).

%% main(Args): runs the benchmark with 100 processes and 200 rounds and
%% prints the elapsed milliseconds.
main(_) -> io:format("~p~n", [start(100, 200)]).

%% start(N, M) -> Milliseconds
%% Builds a ring of N processes, sends a ping around it until the last
%% process has forwarded M pings, and returns the wall-clock time in
%% milliseconds, process setup included.
start(N, M) ->
  statistics(wall_clock),
  Pids = [spawn(fun() -> wait_for_next_pid(M) end) || _ <- lists:seq(1, N)],
  Master = self(),
  %% Pair every process with its successor in one pass (the last wraps
  %% around to the first) instead of indexing the list with lists:nth/2.
  Successors = tl(Pids) ++ [hd(Pids)],
  lists:foreach(fun({Pid, Next}) -> Pid ! {next_pid, Next, Master} end,
    lists:zip(Pids, Successors)),
  hd(Pids) ! ping,
  wait_for_last(lists:last(Pids)),
  {_, T} = statistics(wall_clock),
  T.

%% Blocks until the process is told its successor, then starts relaying.
wait_for_next_pid(MaxLife) ->
  receive
    {next_pid, Pid, Master} ->
      wait_for_ping(Pid, MaxLife, Master)
  end.

%% Relays MaxLife pings to NextPid, then reports {stop, self()} to Master.
wait_for_ping(NextPid, MaxLife, Master) when MaxLife > 0 ->
  receive
    ping ->
      NextPid ! ping,
      wait_for_ping(NextPid, MaxLife - 1, Master)
  end;
wait_for_ping(_, 0, Master) ->
  Master ! {stop, self()}.

%% Waits for the stop report of LastPid only; the stop reports of the other
%% ring members are left in the caller's mailbox.
wait_for_last(LastPid) ->
  receive
    {stop, LastPid} -> void
  end.

[python]pdf自动切边

#!/usr/bin/env python2
# coding: utf8
 
from PIL import Image, ImageFilter, ImageChops, ImageDraw
import glob
from multiprocessing import Pool
import sys
import os
import hashlib
from shutil import rmtree
 
def main():
  """Crop the margins of every page of a PDF.

  Usage: script input_file output_file [edge_ratio]

  The input is rendered to one grayscale PNG per page with Ghostscript,
  every PNG is cropped in a worker pool, each cropped PNG is converted back
  to a one-page PDF with ImageMagick `convert`, and the pages are joined in
  page-number order into output_file with Ghostscript.

  External programs are started with argument lists rather than shell
  strings, so file names containing spaces or shell metacharacters are
  passed through unchanged.
  """
  import subprocess

  if len(sys.argv) < 3:
    print('usage: %s input_file output_file [edge_ratio]' % sys.argv[0])
    sys.exit()
  inputfile = sys.argv[1]
  outputfile = sys.argv[2]
  if inputfile == outputfile:
    print('cannot the same')
    sys.exit()
  tmp_dir = '/tmp/%s' % hashlib.md5(inputfile).hexdigest()
  if os.path.exists(tmp_dir):
    rmtree(tmp_dir)
  os.mkdir(tmp_dir)
  subprocess.call([
    'gs', '-sDEVICE=pnggray', '-r150', '-dGraphicsAlphaBits=4',
    '-dTextAlphaBits=4', '-dDOINTERPOLATE',
    '-sOutputFile=%s/%%d.png' % tmp_dir,
    '-dSAFER', '-dBATCH', '-dNOPAUSE', inputfile])

  pool = Pool(10)
  try:
    pool.map(crop, glob.glob('%s/*.png' % tmp_dir))
  finally:
    # Shut the worker processes down once cropping is finished.
    pool.close()
    pool.join()

  def page_number(path):
    # Ghostscript names the pages 1.png, 2.png, ...
    return int(os.path.basename(path).split('.')[0])

  pdfs = []
  for png in sorted(glob.glob('%s/*.png' % tmp_dir), key = page_number):
    pdf = png + '.pdf'
    subprocess.call(['convert', png, pdf])
    print(png)
    if os.path.exists(pdf):
      pdfs.append(pdf)
  subprocess.call([
    'gs', '-sDEVICE=pdfwrite', '-sOutputFile=%s' % outputfile,
    '-dBATCH', '-dNOPAUSE'] + pdfs)
 
def foo(im):
  """Return a cleaned-up copy of a page image for bounding-box detection.

  Four strips along the page borders, each `ratio` of the page size deep
  (sys.argv[3] if given, otherwise 0.05), are painted white when more than
  90% of their pixels are brighter than 196.  The image then goes through
  three median filters and three min filters.

  Note that the white rectangles are drawn onto `im` itself; the filtered
  result is a new image.
  """
  draw = ImageDraw.Draw(im)
  width, height = im.size
  ratio = float(sys.argv[3]) if len(sys.argv) > 3 else 0.05

  matrix = im.load()
  def edge(x, y, x_, y_): # handle one border strip
    total = (x_ - x) * (y_ - y)
    if total <= 0:
      # Zero-area strip (e.g. ratio 0): nothing to inspect, and the
      # division below would fail.
      return
    n = sum(1 for xx in range(x, x_) for yy in range(y, y_)
            if matrix[xx, yy] > 196)
    if float(n) / total > 0.9:
      draw.rectangle((x, y, x_, y_), 255)
  edge(0, 0, width, int(height * ratio))
  edge(0, 0, int(width * ratio), height)
  edge(int(width * (1 - ratio)), 0, width, height)
  edge(0, int(height * (1 - ratio)), width, height)

  for _ in range(3):
    im = im.filter(ImageFilter.MedianFilter)
  for _ in range(3):
    im = im.filter(ImageFilter.MinFilter)
  return im
 
def crop(f):
  """Crop the PNG file `f` in place to the bounding box of its content.

  The bounding box is measured on a cleaned, 1-bit version of the page;
  the crop itself is applied to a freshly opened copy of the file.
  """
  print('processing %s' % f)
  cleaned = foo(Image.open(f)).convert('1')
  blank = Image.new('1', cleaned.size, 255)
  bbox = ImageChops.difference(cleaned, blank).getbbox()
  page = Image.open(f)
  page = page.crop(bbox)
  page.save(f, 'png')
  print('processed %s' % f)
 
# Run the command-line entry point only when executed as a script.
if __name__ == '__main__':
  main()

pattern matcher

# coding: utf8
from parse import Parser, Node
hierarchy = '''
中书省 丞相 平章政事 中书省丞 参知政事 郎中 员外郎
内阁 中极殿大学士 建极殿大学士 文华殿大学士 武英殿大学士 文渊阁大学士 东阁大学士
六部
  吏部 吏部尚书 吏部侍郎 吏部郎中 吏部员外郎 吏部主事
  户部 户部尚书 户部侍郎 户部郎中 户部员外郎 户部主事
  礼部 礼部尚书 礼部侍郎 礼部郎中 礼部员外郎 礼部主事
    僧录司善世
    僧录司阐教
    道录司正一
    道录司演法
    龙虎山正一真人
  兵部 兵部尚书 兵部侍郎 兵部郎中 兵部员外郎 兵部主事
  刑部 刑部尚书 刑部侍郎 刑部郎中 刑部员外郎 刑部主事
  工部 工部尚书 工部侍郎 工部郎中 工部员外郎 工部主事
五寺
  大理寺 大理寺卿 大理寺少卿 大理寺丞 大理寺正
  太常寺 太常寺卿 太常寺少卿 太常寺丞
  光禄寺 光禄寺卿 光禄寺少卿 光禄寺丞
  太仆寺 太仆寺卿 太仆寺少卿 太仆寺丞
  鸿胪寺 鸿胪寺卿 鸿胪寺少卿 鸿胪寺丞
都察院 都御史 副都御史 金都御史 经历
通政司 通政使 通政 参议
翰林院 翰林院学士 侍读学士 侍讲学士 侍读 侍讲 修撰
国子监 祭酒 司业 衍圣公
钦天监 监正 监副 官正
'''
# Parse the indented hierarchy text into a tree of Node objects.
tree = Parser(hierarchy).root
# Match the tree against a list of pattern strings.  Judging by the sample
# output, a name after '*' captures a single Node and a name after '**'
# captures a list of Nodes -- TODO confirm against the parse module.
matches = tree.match(
  '.*',
  '.*',
  '六部 *a *b *c *d *e *f',
  '*temple/.* .*, *foo **res1',
  '**res2',
)
# Print every captured name followed by the dump of what it captured.
for k in matches:
  v = matches[k]
  print '=' * 15, k, '=' * 15
  if isinstance(v, Node):
    print v.dump()
  elif isinstance(v, list):
    # Several captured nodes: separate their dumps with a dashed line.
    print ('\n'+'-'*30+'\n').join(x.dump() for x in v)

输出:

=============== a ===============
吏部
    吏部尚书
    吏部侍郎
    吏部郎中
    吏部员外郎
    吏部主事
=============== c ===============
礼部
    礼部尚书
    礼部侍郎
    礼部郎中
    礼部员外郎
    礼部主事
    僧录司善世
    僧录司阐教
    道录司正一
    道录司演法
    龙虎山正一真人
=============== b ===============
户部
    户部尚书
    户部侍郎
    户部郎中
    户部员外郎
    户部主事
=============== e ===============
刑部
    刑部尚书
    刑部侍郎
    刑部郎中
    刑部员外郎
    刑部主事
=============== d ===============
兵部
    兵部尚书
    兵部侍郎
    兵部郎中
    兵部员外郎
    兵部主事
=============== f ===============
工部
    工部尚书
    工部侍郎
    工部郎中
    工部员外郎
    工部主事
=============== res2 ===============
都察院
    都御史
    副都御史
    金都御史
    经历
------------------------------
通政司
    通政使
    通政
    参议
------------------------------
翰林院
    翰林院学士
    侍读学士
    侍讲学士
    侍读
    侍讲
    修撰
------------------------------
国子监
    祭酒
    司业
    衍圣公
------------------------------
钦天监
    监正
    监副
    官正
=============== foo ===============
大理寺卿
=============== res1 ===============
太常寺
    太常寺卿
    太常寺少卿
    太常寺丞
------------------------------
光禄寺
    光禄寺卿
    光禄寺少卿
    光禄寺丞
------------------------------
太仆寺
    太仆寺卿
    太仆寺少卿
    太仆寺丞
------------------------------
鸿胪寺
    鸿胪寺卿
    鸿胪寺少卿
    鸿胪寺丞
=============== temple ===============
五寺
    大理寺
        大理寺卿
        大理寺少卿
        大理寺丞
        大理寺正
    太常寺
        太常寺卿
        太常寺少卿
        太常寺丞
    光禄寺
        光禄寺卿
        光禄寺少卿
        光禄寺丞
    太仆寺
        太仆寺卿
        太仆寺少卿
        太仆寺丞
    鸿胪寺
        鸿胪寺卿
        鸿胪寺少卿
        鸿胪寺丞

[python]回调函数的参数传递

先看一段代码:

class Foo:
  """Toy model of a system extended through registered callbacks."""

  def __init__(self, callbacks):
    # The list is stored as given (not copied); hook() appends to it.
    self.callbacks = callbacks
    for value, attr in enumerate(('a', 'b', 'c', 'd')):
      setattr(self, attr, value)

  def hook(self, callback):
    """Register one more callback at the end of the list."""
    registered = self.callbacks
    registered.append(callback)

  def do_something(self):
    """Call every callback, in registration order, with (a, c, d, b)."""
    for callback in self.callbacks:
      callback(*[getattr(self, attr) for attr in ('a', 'c', 'd', 'b')])
 
# Three callbacks sharing the (a, c, d, b) parameter list that
# Foo.do_something() passes positionally.
def callback1(a, c, d, b):
  print 'calling callback1'

def callback2(a, c, d, b):
  print 'calling callback2'

def callback3(a, c, d, b):
  print 'calling callback3'

# callback1 is passed to the constructor; the other two are hooked afterwards.
foo = Foo([callback1])
foo.hook(callback3)
foo.hook(callback2)

# Callbacks run in registration order: callback1, callback3, callback2.
foo.do_something()

这是通过注册回调函数,来对系统的行为进行扩展的一个简化模型(也可以通过Foo.__dict__.update()来实现,同理,不过不能保持顺序)。
比如要新注册一个函数,可以这样写:

def bar(a, c, d, b): pass
foo.hook(bar)

稍有常识的人都会看出,如果我们的回调函数继续添加,它的形参难道可以不是四个吗?
不能,因为在do_something里面已经写死了,传递了四个实参,且是a, c, d, b的顺序
这样一来,如果想要改变实参传递的顺序,比如改成(a, b, c, d),那就需要改变所有回调函数的形参,这显然很麻烦
这样的情况其实很少,除非设计失误然后又有强烈的洁癖,才有可能实行这样的重构
更常见的情况是,一个回调函数需要a, b, c, d以外的参数,这时候就不得不去修改了:

def sob_callback(a, b, c, d, e): pass
foo.hook(sob_callback)

要使此回调函数工作,需要将callback1, callback2, callback3和foo的参数表修改成同样的(a, b, c, d, e)
且需要将do_something里面的调用语句改成:c(a, b, c, d, e)
少数几个函数还好说,要是有几十上百或者更多个,又没有自动化的重构工具的帮助,很可能就会放弃实现sob_callback了。

解决方法是,将需要传递的参数全部放在一个dict里面:

class Foo:
  ......
  def do_something(self):
    for c in self.callbacks:
      c({
        'a': self.a,
        'b': self.b,
        'c': self.c,
        'd': self.d,
      })
 
def callback1(args):
  ......
......

这样一来,需要传递新的参数的话,就不用修改回调函数的签名了,只需要往传递的dict里面放就行。
当然,在回调函数里面使用参数时也稍微麻烦了点

还有另外一种方法:

import inspect
class Foo:
  ......
  def do_something(self):
    for c in self.callbacks:
      c(*[getattr(self, x) for x in inspect.getargspec(c)[0]])
 
def callback1(a, b, c):
  print a, b, c
 
def callback2(c, b, a):
  print a, b, c
 
def callback3(b, c):
  print b, c
......

这个实现方式里面,回调函数获得的实参,是各不一样的,参数数量都可以不一样
而且,无论怎样扩展,都不需要改变do_something的代码(或者可以将getattr(self, x)改成其他更灵活的东西,比如locals()[x])
回调函数需要什么,就在参数表里列出来,就会得到且只会得到所需的,调用环境里的值
我比较喜欢这种方式,打算给现在的一个系统重构下

最近在做一个php代码生成器,也可以算是个框架吧。用python实现的。
不仅仅可以输出php代码,输出ruby,lua之类的也是可以的,跟语言平台有关的代码,其实都是用扩展的方式实现
不同的后端,所用的回调函数集不同
大致可用了,正着手写一个实际的系统,逐渐打磨完善之

http://www.fifm.cn/ 电台源地址采集脚本

真实地址被对称加密过,这个加密串还需要以一个数字做参数去post而得。不过它的js里面就有解密的函数,直接就转换成python的。python不支持i++这样的语法就是不美啊。

# coding: utf8
import urllib2
import urllib
import re
from multiprocessing import Pool, Value
from ctypes import c_int
 
# Shared progress counter: how many stations have been processed so far.
counter = Value(c_int, 0)
 
def go(page):
  """Scrape one index page and write its stations to the file 'radio<page>'.

  The page is cut into sections at every 'class="gn"' marker (the first
  marker is only looked for from offset len(marker) on).  The text of the
  marker's tag is the category, and every radio link up to the next marker
  is attributed to that category, so a station is listed once, under its
  own category.  Each station is resolved with process() and written as a
  tab-separated line.
  """
  print('%s %s %s' % ('=' * 30, page, '=' * 30))
  url = 'http://www.fifm.cn/index%d.htm' % page
  html = urllib2.urlopen(url).read()

  marker = 'class="gn"'
  radios = []
  start = html.find(marker, len(marker))
  while start != -1:
    next_start = html.find(marker, start + len(marker))
    end = next_start if next_start != -1 else len(html)
    try:
      cat = html[html.index('>', start) + 1:html.index('<', start)]
    except ValueError:
      # Truncated markup: stop collecting, keep what was found so far.
      break
    links = re.findall(r'radio([0-9]+)\.htm.*?>([^<]*)', html[start:end])
    radios += [(cat, link) for link in links]
    start = next_start

  ret = map(process, radios)
  # `with` closes the file even when writing fails.
  with open('radio%d' % page, 'w') as output:
    output.write('\n'.join(['\t'.join(x) for x in ret]))
 
def process(arg):
  """Resolve the stream address of one station.

  arg -- (category, (serial, name)) as collected by go().

  Posts the serial to ch.ashx, takes the encrypted address from the reply
  and decodes it.  The request is tried up to 11 times; if every attempt
  fails the address is left empty instead of crashing on an unset variable.

  Returns (category, name, utf8_encoded_address).
  """
  cat, radio = arg
  serial, name = radio
  req = urllib2.Request('http://www.fifm.cn/ch.ashx', data = urllib.urlencode({
    'v': 1,
    'i': serial,
    }), headers = { # required, otherwise the server returns empty data
      'Referer': 'http://www.fifm.cn/r/radio%s.htm' % serial,
      'User-Agent': 'Mozilla/5.0 (X11; Linux i686; rv:2.0.1) Gecko/20110429 Firefox/4.0.1',
    })
  url = ''
  for _ in range(11):
    try:
      url = urllib2.urlopen(req).read().split('\t')[0][2:]
      break
    except Exception:
      # Best-effort retry; unlike a bare `except:` this lets
      # KeyboardInterrupt and SystemExit through.
      continue
  counter.value += 1
  if counter.value % 10 == 0:
    print(counter.value)
  return (cat, name, decode(url).encode('utf8'))
 
def decode(hex_text): # decryption function, port of HexToUtf8 from the page's js
  """Decrypt a hex string from the site into text.

  Every pair of hex digits is one byte, XOR-ed with the repeating key
  'welcome to fifm.cn'.  The resulting bytes are read as UTF-8 sequences of
  one, two or three bytes (any lead byte that is not below 128 and not in
  192..223 is treated as the start of a three-byte sequence, as in the
  original js).

  The key position now wraps after every byte; previously it was not
  wrapped after the second byte of a two-byte character, which raised
  IndexError when that byte used the last key character.

  Raises IndexError on an odd number of hex digits or a truncated sequence.
  """
  key = 'welcome to fifm.cn'
  try:
    to_char = unichr  # Python 2
  except NameError:
    to_char = chr

  # XOR-decrypt all bytes first; the key index is simply the byte index
  # modulo the key length.
  data = [int(hex_text[i] + hex_text[i + 1], 16) ^ ord(key[(i // 2) % len(key)])
          for i in range(0, len(hex_text), 2)]

  chars = []
  pos = 0
  while pos < len(data):
    c = data[pos]
    if c < 128:
      chars.append(to_char(c))
      pos += 1
    elif 191 < c < 224:
      chars.append(to_char((c & 31) << 6 | (data[pos + 1] & 63)))
      pos += 2
    else:
      chars.append(to_char(
        ((c & 15) << 12) | ((data[pos + 1] & 63) << 6) | (data[pos + 2] & 63)))
      pos += 3
  return ''.join(chars)
 
def main():
  """Scrape index pages 2 through 10, one after another."""
  for page in range(2, 11):
    go(page)

if __name__ == '__main__':
  main()

PHP: html dsl translator

跟这个东西是一样作用的:http://tech.reus.me/?p=519。不好看,能用,有站点在用。聊作记录吧,想用的时候不用翻来翻去(之前有个系统的代码就不知不觉被我删掉了,备份都找不到……)
这个写了很久了,今天写了另一个python的dsl parser,用该dsl来实现此功能可以更清晰一些

<?php
defined('ENTRANCE') or die('ACCESS DENIED');
require_once 'Twig/Autoloader.php';
Twig_Autoloader::register();
 
/**
 * Read a template file and turn it into a flat list of array(indent, text) tokens.
 *
 * Blank lines and lines starting with '-' (comments) are skipped.  A line
 * starting with '+' includes another template, whose tokens are indented
 * relative to the including line.  On any other line ' / ' separates nested
 * elements (each one level deeper) and ' | ' separates siblings.
 *
 * @param string $name        template file name, relative to ../templates/
 * @param int    $base_indent indent added to every token of this file
 * @return array list of array(indent, text)
 */
function tpl_tokenize($name, $base_indent = 0) {
  $path = dirname(__FILE__) . '/../templates/' . $name;
  $tokens = array();
  foreach (explode("\n", file_get_contents($path)) as $raw_line) {
    $line = rtrim($raw_line);
    $body = trim($line);
    if ($body === '' || $body[0] === '-') { # blank line or comment
      continue;
    }
    $indent = strlen($line) - strlen(ltrim($line)) + $base_indent;
    if ($body[0] === '+') { # include
      $tokens = array_merge($tokens, tpl_tokenize(substr($body, 1), $indent));
      continue;
    }
    foreach (explode(' / ', $body) as $depth => $slice) {
      foreach (explode(' | ', $slice) as $inner) {
        $tokens[] = array($indent + $depth, $inner);
      }
    }
  }
  return $tokens;
}
 
# Turn the token list produced by tpl_tokenize() into an HTML string.
# Tokens are consumed from the front of $tokens (the array is emptied as a
# side effect).  $stack holds the opened-but-not-yet-closed 'nstt' tokens;
# the indent of the next token decides whether the current tag gets a
# child, a sibling, or whether enclosing tags have to be closed first.
function tpl_parse(&$tokens) {
  $ret = array();
  $stack = array();
  $t = tpl_parse_token(array_shift($tokens));
  while ($t != NULL) {
    if ($t['type'] == 'nstt') { # tag that needs an explicit closing tag
      if (!$t['opened']) {
        $ret[] = str_repeat(' ', $t['indent']);
        $ret[] = '<' . $t['tag'];
        if (!empty($t['attr'])) {
          $ret[] = ' ' . $t['attr'];
        }
        # no newline after <textarea>, it would become part of its content
        if (in_array($t['tag'], array(
          'textarea',
          ))) {
          $ret[] = ">";
        }
        else {
          $ret[] = ">\n";
        }
        $t['opened'] = true;
      }
      if (empty($tokens)) {
        # last token: close it and everything still open
        tpl_print_close($t, $ret);
        tpl_print_stack($stack, 0, $ret);
        break;
      }
      $next = $tokens[0];
      if ($next[0] > $t['indent']) { #child
        array_push($stack, $t);
        $t = tpl_parse_token(array_shift($tokens));
        continue;
      }
      else if ($next[0] == $t['indent']) { #sibling
        tpl_print_close($t, $ret);
        $t = tpl_parse_token(array_shift($tokens));
        continue;
      }
      else { #father
        tpl_print_close($t, $ret);
        tpl_print_stack($stack, $next[0], $ret);
        $t = tpl_parse_token(array_shift($tokens));
        continue;
      }
    }
    else if ($t['type'] == 'stt') { # self-terminating tag
      $ret[] = str_repeat(' ', $t['indent']);
      $ret[] = '<'  .$t['tag'] . ' ' . $t['attr'] . " />\n";
      if (empty($tokens)) {
        tpl_print_stack($stack, 0, $ret);
        break;
      }
      $next = $tokens[0];
      if ($next[0] > $t['indent']) {
        die('template error'); # raw or stt tokens cannot have child tokens
      }
      else if ($next[0] == $t['indent']) {
        $t = tpl_parse_token(array_shift($tokens));
        continue;
      }
      else {
        tpl_print_stack($stack, $next[0], $ret);
        $t = tpl_parse_token(array_shift($tokens));
        continue;
      }
    }
    else if ($t['type'] == 'raw' || $t['type'] == 'rawe') { # literal text
      $ret[] = str_repeat(' ', $t['indent']);
      $ret[] = $t['attr'] . "\n";
      if (empty($tokens)) {
        tpl_print_stack($stack, 0, $ret);
        break;
      }
      $next = $tokens[0];
      if ($next[0] > $t['indent']) {
        die('template error'); # raw or stt tokens cannot have child tokens
      }
      else if ($next[0] == $t['indent']) {
        $t = tpl_parse_token(array_shift($tokens));
        continue;
      }
      else {
        tpl_print_stack($stack, $next[0], $ret);
        $t = tpl_parse_token(array_shift($tokens));
        continue;
      }
    }
  }

  $ret = implode('', $ret);
  return $ret;
}
 
/**
 * Close every open tag on $stack whose indent is at least $indent,
 * innermost first, appending the closing markup to $ret.
 */
function tpl_print_stack(&$stack, $indent, &$ret) {
  while (!empty($stack)) {
    $top = $stack[sizeof($stack) - 1];
    if ($top['indent'] < $indent) {
      break;
    }
    tpl_print_close(array_pop($stack), $ret);
  }
}
 
/**
 * Append the closing tag for token $t to $ret.
 *
 * The tag is indented unless it is a textarea, and is followed by an
 * "<!-- /id -->" comment when the token carries an id.
 */
function tpl_print_close($t, &$ret) {
  $is_textarea = ($t['tag'] == 'textarea');
  if (!$is_textarea) {
    $ret[] = str_repeat(' ', $t['indent']);
  }
  $ret[] = '</' . $t['tag'] . '>';
  $ret[] = isset($t['id']) ? "<!-- /{$t['id']} -->\n" : "\n";
}
 
/**
 * Expand one array(indent, text) token into the record tpl_parse() works on.
 *
 * The first word of the text is the tag and decides the type (see
 * token_type()).  For 'raw' the whole text becomes 'attr'; for 'rawe' the
 * text without its first character does.  For tags the remaining words are
 * read as: ".name" adds a class, "#name" sets the id, anything else is
 * copied into the attribute string.
 *
 * @param array|null $t array(indent, text), or NULL when the list is exhausted
 * @return array|null keys: indent, opened, tag, type, attr and optionally id
 */
function tpl_parse_token($t) {
  if ($t == NULL) {
    return NULL;
  }
  $ret = array(
    'indent' => $t[0],
    'opened' => false,
  );
  $tok = trim(strtok($t[1], " \t"));
  $ret['tag'] = $tok;
  $ret['type'] = token_type($tok);
  if ($ret['type'] == 'raw') {
    $ret['attr'] = $t[1];
    return $ret;
  }
  if ($ret['type'] == 'rawe') {
    $ret['attr'] = substr($t[1], 1);
    return $ret;
  }

  $classes = array();
  $tag_id = null;
  $attr = '';
  # Fetch and test in one step so the terminating `false` from strtok() is
  # never indexed or appended.
  while (($tok = strtok(" \t")) !== false) {
    if ($tok[0] == '.') {
      $classes[] = substr($tok, 1);
    }
    else if ($tok[0] == '#') {
      $tag_id = substr($tok, 1);
      $ret['id'] = $tag_id;
    }
    else {
      $attr .= ' ' . $tok;
    }
  }
  if (!empty($classes)) {
    $attr = 'class="' . implode(' ', $classes) . '" ' . $attr;
  }
  if ($tag_id != null) {
    $attr = 'id="' . $tag_id . '" ' . $attr;
  }
  $ret['attr'] = trim($attr);
  return $ret;
}
 
/**
 * Render the template source $s with Twig and output the result.
 *
 * The string loader and the Twig environment (with the 'url' filter bound
 * to hook_url) are created on first use and reused afterwards.
 *
 * @param string $s    template source
 * @param array  $vars variables passed to the template
 */
function tpl_compile($s, $vars = array()) {
  static $loader = NULL;
  static $twig = NULL;
  if ($loader == NULL) {
    $loader = new Twig_Loader_String();
  }
  if ($twig == NULL) {
    $twig = new Twig_Environment($loader);
    $twig->addFilter('url', new Twig_Filter_Function('hook_url'));
  }

  $twig->loadTemplate($s)->display($vars);
}
 
/**
 * Render the template named in $context['tpl_name'] with $context['tpl_vars'].
 *
 * Warnings in $context['warnings'] are appended to the template's own
 * 'warnings' variable, which is treated as empty when it is not set.
 */
function tpl(&$context) {
  $name = $context['tpl_name'];
  $vars = $context['tpl_vars'];
  if (isset($context['warnings'])) {
    $existing = isset($vars['warnings']) ? $vars['warnings'] : array();
    $vars['warnings'] = array_merge($existing, $context['warnings']);
  }
  # tpl_parse() takes its argument by reference, so it needs a variable.
  $tokens = tpl_tokenize($name);
  tpl_compile(tpl_parse($tokens), $vars);
}
 
/**
 * Classify the first word of a template line.
 *
 * @param string $s first word of the line
 * @return string 'rawe' when it starts with '=', 'nstt' for a tag that needs
 *                a closing tag, 'stt' for a self-terminating tag, otherwise 'raw'
 */
function token_type($s) {
  if ($s[0] == '=') {
    return 'rawe';
  }
  $tags_by_type = array(
    'nstt' => array(
      'a', 'abbr', 'acronym', 'address',
      'b', 'bdo', 'big', 'blockquote', 'body', 'button',
      'caption', 'cite', 'code', 'colgroup',
      'dd', 'del', 'dfn', 'div', 'dl', 'dt',
      'em', 'fieldset', 'form', 'frameset',
      'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'head', 'html',
      'i', 'iframe', 'ins', 'kbd', 'label', 'legend', 'li',
      'map', 'menu', 'noframes', 'noscript', 'object', 'ol', 'optgroup',
      'option', 'p', 'pre', 'q', 'samp', 'script', 'select', 'small',
      'span', 'strong', 'style', 'sub', 'sup', 'table', 'tbody', 'td',
      'textarea', 'tfoot', 'th', 'thead', 'title', 'tr', 'tt', 'ul', 'var',
    ),
    'stt' => array(
      'area', 'base', 'br', 'col', 'frame', 'hr', 'img', 'input',
      'link', 'meta', 'param',
    ),
  );
  foreach ($tags_by_type as $type => $tags) {
    if (in_array($s, $tags)) {
      return $type;
    }
  }
  return 'raw';
}