about:config
mouse
smooth
disable ipv6
|
||||||
|
about:config mouse 在做一个数据采集的东东,之前用redis,但是几天之后数据猛增到150M+,内存吃紧(在VPS里跑的),用vm的话负载会很高(不知何故),于是干脆用mysql做后端,写了个中间层。接口跟redis差不多,只不过各个数据类型是在不同的名字空间里(因为用不同的表来存),所以keys命令是各个类型分开的。现在完成的只有string和set,list因为懒惰而且现在也未需要用到所以一直是待完成状态。 # coding: utf8 import _mysql def quote(s): return _mysql.escape_string(s) import hashlib def hash(s): return hashlib.sha224(s).hexdigest() import random import time import sys _sql = { 'db': """ CREATE DATABASE %s DEFAULT CHARACTER SET = utf8 DEFAULT COLLATE = utf8_general_ci """, 'strings': """ CREATE TABLE strings ( name TEXT, hash CHAR(56), value TEXT, UNIQUE KEY (hash) ) ENGINE=MyISAM """, 'lists': """ CREATE TABLE lists ( name TEXT, value TEXT, weight INT(64) DEFAULT 0 ) ENGINE=MyISAM """, 'sets': """ CREATE TABLE sets ( name TEXT, nhash CHAR(56), value TEXT, vhash CHAR(56), UNIQUE INDEX hash (nhash,vhash) ) ENGINE=MyISAM """, } class Mydis: def __init__(self, host = 'localhost', user = 'test', password = 'test', database = 'test'): self.init_args = (host, user, password, database) self.connection = _mysql.connect(host, user, password) try: self.connection.select_db(database) except Exception, e: if e[0] == 1049: #no database print 'creating database %s' % database self.connection.query(_sql['db'] % database) self.connection.select_db(database) self.connection.query(_sql['strings']) self.connection.query(_sql['lists']) self.connection.query(_sql['sets']) else: print e sys.exit() self.connection.set_character_set('UTF8') self.connection.query('SET NAMES UTF8') # strings def get(self, name): self._keep_alive() sql = "SELECT value FROM strings WHERE hash='%s' LIMIT 1" % hash(name) return self._get_one(sql) def set(self, name, value): self._keep_alive() if type(value) != str: value = str(value) sql = "INSERT INTO strings VALUES ('%s', '%s', '%s')" % (quote(name), hash(name), quote(value)) try: self.connection.query(sql) except Exception, e: if e[0] == 1062: sql = "UPDATE strings SET value='%s' WHERE hash='%s'" % (quote(value), hash(name)) self.connection.query(sql) else: print e sys.exit() return True def setnx(self, name, value): self._keep_alive() if type(value) != str: value = str(value) sql = "INSERT INTO strings VALUES ('%s', '%s', '%s')" % (quote(name), hash(name), quote(value)) try: self.connection.query(sql) except Exception, e: if e[0] == 1062: pass else: print e sys.exit() return True def rem(self, name): self._keep_alive() sql = "DELETE FROM strings WHERE name='%s'" % quote(name) self.connection.query(sql) return True def keys(self, pattern): self._keep_alive() sql = "SELECT name FROM strings WHERE name LIKE '%s'" % quote(pattern) return self._get_col(sql) # lists def lpush(self, name, value): self._keep_alive() pass def rpush(self, name, value): self._keep_alive() pass def lpop(self, name): self._keep_alive() pass def rpop(self, name): self._keep_alive() pass def llen(self, name): self._keep_alive() pass def lrange(self, name, start, end): self._keep_alive() pass def lindex(self, name, index): self._keep_alive() pass def lset(self, name, index, value): self._keep_alive() pass # sets def sadd(self, name, value): self._keep_alive() if type(value) != str: value = str(value) sql = "INSERT INTO sets VALUES ('%s', '%s', '%s', '%s')" % (quote(name), hash(name), quote(value), hash(value)) try: self.connection.query(sql) except Exception, e: if e[0] == 1062: pass else: print e sys.exit() return True def sget(self, name): self._keep_alive() sql = "SELECT value FROM sets WHERE nhash='%s'" % hash(name) return self._get_one(sql) def srem(self, name, value): self._keep_alive() if type(value) != str: value = str(value) sql = "DELETE FROM sets WHERE nhash='%s' AND vhash='%s'" % (hash(name), hash(value)) self.connection.query(sql) return True def spop(self, name): self._keep_alive() tmp_id = hash(self._unique_id()) sql = "UPDATE sets SET nhash='%s' WHERE nhash='%s' LIMIT 1" % (tmp_id, hash(name)) self.connection.query(sql) sql = "SELECT value FROM sets WHERE nhash='%s'" % tmp_id ret = self._get_one(sql) sql = "DELETE FROM sets WHERE nhash='%s'" % tmp_id self.connection.query(sql) return ret def skeys(self, pattern): self._keep_alive() sql = "SELECT DISTINCT name FROM sets WHERE name LIKE '%s'" % quote(pattern) return self._get_col(sql) def slen(self, name): self._keep_alive() sql = "SELECT COUNT(*) FROM sets WHERE nhash='%s'" % hash(name) return self._get_one(sql) def sismember(self, name, value): self._keep_alive() if type(value) != str: value = str(value) sql = "SELECT COUNT(*) FROM sets WHERE nhash='%s' AND vhash='%s'" % (hash(name), hash(value)) return self._get_one(sql) == '1' def smembers(self, name): self._keep_alive() sql = "SELECT value FROM sets WHERE nhash='%s'" % hash(name) return self._get_col(sql) def sclear(self, name): self._keep_alive() sql = "DELETE FROM sets WHERE nhash='%s'" % hash(name) self.connection.query(sql) return True # misc def _keep_alive(self): try: self.connection.ping() except Exception, e: print 'reconnecting' if e[0] == 2006: self.__init__(*self.init_args) else: print e sys.exit() def _get_one(self, sql): self.connection.query(sql) res = self.connection.store_result() row = res.fetch_row() if row == (): return None return row[0][0] def _get_col(self, sql): self.connection.query(sql) res = self.connection.store_result() row = res.fetch_row() ret = [] while row != (): ret.append(row[0][0]) row = res.fetch_row() return ret def _unique_id(self): r = 0 for i in xrange(64): r = r * 10 + random.randint(0, 9) return str(r) 一些突如其来的想法,或者说是回忆起《Unix编程艺术》中的观念吧: 紧凑性:概念集合有多大?需要多少功夫来掌握?掌握之后多久会忘记?一般来说紧凑比不紧凑要好,概念集合小一点,掌握起来快,不容易忘记,开发时也不需要经常查阅文档。 提高开发效率的一些技术(web开发方面): prevent from upgrading echo “
以前写PHP的时候,哈希函数就只用过md5和sha1,也是最常用的两个。现在多数用python,又听说twitter用的是FNV,于是就用上了。 FNV产生的是一个整数,32位或者64位或者更多。一般用来做字符串哈希的话,需要64位。32位的碰撞率咱不放心。整数的结果,比起md5和sha1的字符串结果,存储空间要小一点。而且FNV也是比较快的。 现成的FNV的python模块是pyfasthash。这个项目名字叫pyfasthash,但模块名字叫pyhash。而且直接通过easy_install是安装不了的,在Downloads那里下载的解压出来也是安装不了的。要checkout svn里面的源码才可以。当初就是捣鼓一个晚上才发现。而且它依赖c++的boost库,编译时间很漫长。总之吐槽点很多。不过用起来还算顺手。 今晚看到了这个:http://sites.google.com/site/murmurhash/avalanche。看上去FNV很不济 -_,-。于是就试下里面最好的murmur。这个页面正是murmur的作者做的。官网在:http://sites.google.com/site/murmurhash/。虽然也有另外一个现成的murmur的python模块(地址),但是是只有32bit的没有64bit的。32bit的不够用。于是只好把它64bit的源码下回来自己封装。代码不多: #include "Python.h" unsigned long long MurmurHash64B ( const void * key, int len, unsigned int seed ) { const unsigned int m = 0x5bd1e995; const int r = 24; unsigned int h1 = seed ^ len; unsigned int h2 = 0; const unsigned int * data = (const unsigned int *)key; while(len >= 8) { unsigned int k1 = *data++; k1 *= m; k1 ^= k1 >> r; k1 *= m; h1 *= m; h1 ^= k1; len -= 4; unsigned int k2 = *data++; k2 *= m; k2 ^= k2 >> r; k2 *= m; h2 *= m; h2 ^= k2; len -= 4; } if(len >= 4) { unsigned int k1 = *data++; k1 *= m; k1 ^= k1 >> r; k1 *= m; h1 *= m; h1 ^= k1; len -= 4; } switch(len) { case 3: h2 ^= ((unsigned char*)data)[2] << 16; case 2: h2 ^= ((unsigned char*)data)[1] << 8; case 1: h2 ^= ((unsigned char*)data)[0]; h2 *= m; }; h1 ^= h2 >> 18; h1 *= m; h2 ^= h1 >> 22; h2 *= m; h1 ^= h2 >> 17; h1 *= m; h2 ^= h1 >> 19; h2 *= m; unsigned long long h = h1; h = (h << 32) | h2; return h; } static PyObject * hash(PyObject *self, PyObject *args) { const char *p; PyArg_ParseTuple(args, "s", &p); unsigned long long ret = MurmurHash64B(p, strlen(p), 0x19870714); return PyLong_FromUnsignedLongLong(ret); } static PyMethodDef PythonPymurmurModules[] = { {"hash", hash, METH_VARARGS, "64bit murmur hash function"}, {NULL, NULL, 0, NULL} }; PyMODINIT_FUNC initpymurmur(void) { Py_InitModule("pymurmur", PythonPymurmurModules); } 主要函数都是照搬的(看代码风格就知道 -_,-),只是改了下数据类型。uint64_t改成unsigned long long。distutils编译安装完毕。 100w个整数转字符串再hash的结果,FNV耗时5s+,murmur耗时1s+,快得很明显。 记录一下,无乜可观。 #include "ae.c" #include "zmalloc.c" #include <stdio.h> #include <time.h> #include <unistd.h> #define TIME 1000 #define MSG "FOOBAR" #define LEN 6 static int foo(aeEventLoop *el, long long id, void *data) { printf("%d\n", time(NULL)); write(*((int*)data), MSG, LEN); return TIME; } static void bar(aeEventLoop *el, int fd, void *data, int mask) { char buffer[LEN + 1]; read(fd, buffer, LEN); buffer[LEN] = '\0'; printf("received: %s\n", buffer); } int main() { int pipe_fd[2]; pipe(pipe_fd); aeEventLoop *el = aeCreateEventLoop(); aeCreateTimeEvent(el, TIME, foo, &pipe_fd[1], NULL); aeCreateFileEvent(el, pipe_fd[0], AE_READABLE, bar, NULL); aeMain(el); aeDeleteEventLoop(el); return 0; } 这个库不到400行,实现了fd和timeout事件,不错用。 申请到了dropbox api的access权限,于是注册了一个application,开始尝试用它来替代gmail作vps的备份。用的是python的binding。(我咋觉得前面几个句子洋文过多呢 -_,-) 注意dropbox的auth模块依赖oauth,所以没有装的话记得先easy_install oauth 首先要创建一个配置文件,把config/testing.ini.example复制一份就行。然后修改SOME_KEY和SOME_SECRET(在application的edit页的最下方,那2个随机字符串。YOUR_LOGIN_EMAIL和YOUR_LOGIN_PASSWORD不需要改,Oauth是不需要登录密码的。 下面先取得一个access token from dropbox.auth import * config = Authenticator.load_config('dropbox_config') auth = Authenticator(config) token = auth.obtain_request_token() print auth.build_authorize_url(token) raw_input() print auth.obtain_access_token(token, '') 用浏览器打开输出的链接,点Accept,再返回终端按回车(结束raw_input的输入,即可得到access token,记录下来,这个可以重复使用的。 试一下文件上传: from dropbox.auth import * config = Authenticator.load_config('dropbox_config') auth = Authenticator(config) from oauth import oauth access_token_string = '刚才得到的access token' access_token = oauth.OAuthToken.from_string(access_token_string) from dropbox.client import * client = DropboxClient( 'api.getdropbox.com', 'api-content.getdropbox.com', 80, auth, access_token) client.put_file('sandbox', '/', open('dropbox_config', 'r')) 打开注册application时输入的sandbox目录,可以看到dropbox_config文件已经上传成功了。 也可以用account_info方法来查看帐户信息 print client.account_info().data 其中包括了空间的容量和使用状况等信息。 有次帮某人发spam,找人来填调查。写了个自动评论的脚本,但是发出一定数量的评论之后就会遭遇验证码,于是决定破解之。 首先是需要取得验证码的样本,以作训练特征之用。而要取得验证码,首先要模拟登录的请求: usr = 'xx' psw = 'oo' resp = urllib2.urlopen('https://login.sina.com.cn/sso/login.php?username=%s&password=%s&returntype=TEXT' % ( usr, psw)) cookie = Cookie.SimpleCookie(resp.headers['set-cookie']) headers = { 'Referer': 'http://t.sina.com.cn', 'Cookie': cookie_header(cookie), 'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.3pre) Gecko/20100405 Firefox/3.6.3plugin1', } def cookie_header(cookie): ret = '' for v in cookie.values(): ret += "%s=%s; " % (v.key, v.value) return ret headers就是后续的请求中,需要传递的参数了。 for i in xrange(100): req_img = urllib2.Request('http://t.sina.com.cn/pincode/pin.php?lang=zh&r=%d&rule' % int(time() * 1000), headers = headers) res_img = urllib2.urlopen(req_img) f = open('xinlang_pincode/%d.png' % i, 'wb') f.write(res_img.read()) f.close() 有一些验证码的回答是中文,中国首都什么的,这些不处理,直接返回失败。因为可以重复获取重新识别,不成问题的。下面是处理算术问题验证码的方法: from PIL import Image, ImageFilter, ImageEnhance file = 'xinlang_pincode/0.png' im = Image.open(file) im = im.convert() enhancer = ImageEnhance.Brightness(im) im = enhancer.enhance(2.0) #加亮,效果见图1 enhancer = ImageEnhance.Contrast(im) im = enhancer.enhance(4) #提高对比度,效果见图2 im = im.convert('1') #二值化,效果见图3 im = im.filter(ImageFilter.MedianFilter) #中值去噪,效果见图4 im.show() #调用xv命令来显示图片,方便debug 图1: 这样处理过之后,图片背景中的色块被过滤掉,杂点也被过滤掉,而数字的形状也没有太大的损失。 下面是分解字符,也就是将每一个数字或者+-*等符号分解出来: imim = im.load() WIDTH = 250 HEIGHT = 50 i = 0 has_start = False chars = [] while i < WIDTH: all_none = True for j in xrange(HEIGHT): if imim[i, j] != 255: all_none = False if all_none: if has_start: end_x = i has_start = False char = im.crop((start_x, 0, end_x, HEIGHT)) char.show() #到这一步的效果见图5 charchar = char.load() width = end_x - start_x y1 = 0 y2 = HEIGHT - 1 all_none = True while all_none: for ii in xrange(width): if charchar[ii, y1] != 255: all_none = False y1 += 1 all_none = True while all_none: for ii in xrange(width): if charchar[ii, y2] != 255: all_none = False y2 -= 1 char = char.crop((0, y1 - 1, width, y2 + 2)) char = char.resize((20, 20)) #将图片缩放到统一的大小 char.show() #到这一步的效果见图6 chars.append(char) else: if not has_start: start_x = i has_start = True i += 1 图5:字符被独立分割开 这一步得到的chars是下面要用到的。 然后是训练,也就是形成特征库。特征库规模越大,识别率也越高。不过训练起来也挺累的,有几十上百条也就好了。至少0到9和+-*=等几个字符的特征都要有: file = open('xinlang.img', 'a') for c in chars: nstr = '' im_loaded = c.load() for x in range(20): for y in range(20): if im_loaded[x, y] == 255: nstr += '0' else: nstr += '1' c.show() n = raw_input('? ') file.write(nstr+':'+n+'\n') file.close() 这里的特征,就是直接把每一个像素的信息,用0和1组成的字符串表示。 比对函数: pattern = [] for l in open('xinlang.img', 'r').read().split('\n'): pattern.append(l.split(':')) del pattern[-1] def what(img): im = img.load() nstr = '' for x in xrange(20): #生成目标图像的特征字符串 for y in xrange(20): if im[x, y] == 255: nstr += '0' else: nstr += '1' minmin = 400 res = None for p in pattern: cur = 0 for i in xrange(400): if nstr[i] != p[0][i]: #比对每一个像素,如果不相同,则增加差异值 cur += 1 if cur < = minmin: #记录下差异值最小时所对应的字符 minmin = cur res = p[1] return res 最后测试一下: for c in chars: print what(c), 结果: 这个验证码还是挺好破解的,因为字符之间间距很大,而且没有旋转,没有扭曲,不需要多少变换就能得到可用的结果。像google的那种,就完全没法可想了。 以下由实际代码转化而成(因为涉及到私密数据)。 假设f1, f2, f3, f4, f5都是2个参数的函数,现在需要顺序执行它们,而且一个函数出现异常,不能影响到后续的函数。并且所有异常处理的代码都是一样的。 “拿不准就穷举”,于是: try: f1(6, 4) except Exception, e: #do something try: f2(6, 4) except Exception, e: #do something try: f3(6, 4) except Exception, e: #do something try: f4(6, 4) except Exception, e: #do something try: f5(6, 4) except Exception, e: #do something 简单粗暴,但是重复太多,每次需要修改#do something部分时都很麻烦。 于是有了使用decorator的版本: class do_something(): def __init__(self, f): try: f() except Exception, e: #do something def __call__(self): pass @do_something def run_f1(): f1(6 ,4) @do_something def run_f2(): f2(6 ,4) @do_something def run_f3(): f3(6 ,4) @do_something def run_f4(): f4(6 ,4) @do_something def run_f5(): f5(6 ,4) run_f1() run_f2() run_f3() run_f4() run_f5() 这样消除了#do something部分的重复,但是每个函数都需要再包装一次以便decorate(不能直接给f1-f5加),显得很累赘 后来醒悟起这个方法,短小精悍了很多: functions = ( (f1, (6, 4)), (f2, (6, 4)), (f3, (6, 4)), (f4, (6, 4)), (f5, (6, 4)), ) for func in functions: try: func[0](*func[1]) except Exception, e: #do something 聊作记录。 基本功能就几个:存储一个document,发送feed去某个box,在某个box中标记feed为up或者down(和undo),还有列出box中的document。用来替代之前的基于mongodb的存储层的。 #!/usr/bin/env python # coding: utf8 import _mysql import _mysql_exceptions _connection = None _host = 'localhost' _user = 'root' _password = '' _database = 'test' def connect(host = _host, user = _user, password = _password, database = _database): global _host, _user, _password, _database _host = host _user = user _password = password _database = database def db(): global _connection, _host, _user, _password, _database if _connection == None: _connection = _mysql.connect(_host, _user, _password, _database) _connection.set_character_set('UTF8') _connection.query("SET NAMES UTF8") return _connection import json encoder = json.JSONEncoder() decoder = json.JSONDecoder() import pyhash hasher = pyhash.fnv1a_64() import time import random def doc(data, key = None): if key is None: r = '' for i in xrange(16): #这个太小会造成碰撞 r += str(random.randint(0, 9)) key = hasher(str(time.time() * 1000000) + r) else: key = hasher(encoder.encode(key)) _ensure_table_exists('doc', '''CREATE TABLE doc ( id INT(16) PRIMARY KEY AUTO_INCREMENT, doc LONGTEXT NOT NULL, hash CHAR(64) UNIQUE KEY ) ENGINE=MyISAM''') try: db().query('''INSERT INTO doc VALUES (NULL, '%s', '%d')''' % (_mysql.escape_string(encoder.encode(data)), key)) except _mysql_exceptions.IntegrityError, e: if e[0] == 1062: return False return db().insert_id() _sql_box = '''CREATE TABLE box_%s ( no INT(16) PRIMARY KEY AUTO_INCREMENT, id INT(16), up INT(1), INDEX up (up) ) ENGINE=MyISAM''' def push(doc, box): _ensure_table_exists('box_%s' % box, _sql_box % box) db().query('''INSERT INTO box_%s VALUES (NULL, '%s', 1)''' % (box, str(doc))) def down(no, box): _ensure_table_exists('box_%s' % box, _sql_box % box) db().query("UPDATE box_%s SET up=0 WHERE no=%s" % (box, str(no))) if db().affected_rows() == 1: setvar('last_down_%s' % box, no, False) return True def undown(box): _ensure_table_exists('box_%s' % box, _sql_box % box) no = getvar('last_down_%s' % box) if no is not None: db().query("UPDATE box_%s SET up=1 WHERE no=%s" % (box, str(no))) delvar('last_down_%s' % box, no) def up(no, box): _ensure_table_exists('box_%s' % box, _sql_box % box) db().query("UPDATE box_%s SET up=1 WHERE no=%s" % (box, str(no))) if db().affected_rows() == 1: setvar('last_up_%s' % box, no, False) return True def unup(box): _ensure_table_exists('box_%s' % box, _sql_box % box) no = getvar('last_up_%s' % box) if no is not None: db().query("UPDATE box_%s SET up=0 WHERE no=%s" % (box, str(no))) delvar('last_up_%s' % box, no) def box(box, limit = None, order = 'ASC'): _ensure_table_exists('box_%s' % box, _sql_box % box) docs = [] #储存docs的数组,因为要按照box的顺序返回docs,所以要保存 sql = 'SELECT no,id FROM box_%s WHERE up=1 ORDER BY no %s' % (box, order) if limit is not None: sql += ' LIMIT %d' % limit db().query(sql) res = db().store_result() row = res.fetch_row() box = [] while row != (): box.append((row[0][0], row[0][1])) # 0 -> no, 1 -> id row = res.fetch_row() if box == []: return [] sql = 'SELECT id, doc FROM doc WHERE id IN (%s)' % ','.join(set([s[1] for s in box])) db().query(sql) res = db().store_result() row = res.fetch_row() doc = {} while row != (): doc[row[0][0]] = decoder.decode(row[0][1]) row = res.fetch_row() ret = [] for i in box: ret.append(Entry(i[0], i[1], doc[i[1]])) return ret class Entry(): def __init__(self, no, id, doc): self.no = no self.id = id self.doc = doc def __getitem__(self, i): return self.doc[i] def __setitem__(self, k, v): self.doc[k] = v def has_key(self, key): return self.doc.has_key(key) _sql_var = '''CREATE TABLE %s ( name CHAR(255) NOT NULL, value TEXT NOT NULL, time CHAR(16) NOT NULL, INDEX name (name), INDEX time (time) ) ENGINE=MyISAM''' def getvar(name, table = 'var'): _ensure_table_exists(table, _sql_var % table) sql = "SELECT value FROM %s WHERE name='%s' ORDER BY time DESC LIMIT 1" % (table, name) db().query(sql) res = db().store_result().fetch_row() if res == (): return None return res[0][0] def setvar(name, value, update = True, table = 'var'): _ensure_table_exists(table, _sql_var % table) if update: db().query("DELETE FROM %s WHERE name='%s'" % (table, name)) db().query("INSERT INTO %s VALUES ('%s', '%s', '%d')" % (table, name, _mysql.escape_string(str(value)), int(time.time() * 1000000))) def delvar(name, value = None, table = 'var'): _ensure_table_exists(table, _sql_var % table) sql = "DELETE FROM %s WHERE name='%s'" % (table, name) if value is not None: sql += " AND value='%s'" % _mysql.escape_string(value) sql += 'ORDER BY time DESC LIMIT 1' db().query(sql) _key_table_created = {} def _ensure_table_exists(name, sql): global _key_table_created if _key_table_created.has_key(name): return try: db().query("SELECT * FROM %s LIMIT 1" % name) db().store_result() _key_table_created[name] = True except Exception, e: if e[0] == 1146: # table/collection doesn't exists db().query(sql) _key_table_created[name] = True else: print e def s(i): return encoder.encode(i) def u(i): return decoder.decode(i) |
||||||
|
Copyright © 2010 声zzz - All Rights Reserved |
||||||
最新评论