Last active
July 23, 2017 00:04
-
-
Save northface/ee3ecc95f4c320e663e76d3daaa34d9a to your computer and use it in GitHub Desktop.
濁点「゛」・半濁点「゜」のある文字が一文字として扱われない時に置換するための方法(例:「ば」(1文字)→「は゛」(2文字)のようなケース)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "toc": "true" | |
| }, | |
| "source": [ | |
| "# Table of Contents\n", | |
| " <p><div class=\"lev1 toc-item\"><a href=\"#事前調査\" data-toc-modified-id=\"事前調査-1\"><span class=\"toc-item-num\">1 </span>事前調査</a></div><div class=\"lev1 toc-item\"><a href=\"#置換辞書作成\" data-toc-modified-id=\"置換辞書作成-2\"><span class=\"toc-item-num\">2 </span>置換辞書作成</a></div><div class=\"lev1 toc-item\"><a href=\"#ファイル内の置換\" data-toc-modified-id=\"ファイル内の置換-3\"><span class=\"toc-item-num\">3 </span>ファイル内の置換</a></div><div class=\"lev2 toc-item\"><a href=\"#テキストファイルの場合\" data-toc-modified-id=\"テキストファイルの場合-31\"><span class=\"toc-item-num\">3.1 </span>テキストファイルの場合</a></div><div class=\"lev2 toc-item\"><a href=\"#Excelファイルの場合\" data-toc-modified-id=\"Excelファイルの場合-32\"><span class=\"toc-item-num\">3.2 </span>Excelファイルの場合</a></div>" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# 事前調査" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "UTF-8\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "import sys\n", | |
| "print(sys.stdout.encoding)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# Probe strings for the encoding experiments below.\n", | |
| "# The precomposed and decomposed forms of the voiced kana look identical on screen,\n", | |
| "# so they are spelled with escapes to survive copy/paste and editor normalization.\n", | |
| "t = \"\\u30c6\"  # TE without dakuten\n", | |
| "t1 = \"\\u30c7\" + \"1\"  # precomposed DE: a single code point (U+30C7)\n", | |
| "t2 = \"\\u30c6\\u3099\" + \"2\"  # decomposed DE: TE (U+30C6) + combining dakuten (U+3099)\n", | |
| "h = \"\\u30d8\"  # HE without dakuten" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "b'\\xe3\\x83\\x86'" | |
| ] | |
| }, | |
| "execution_count": 3, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "t.encode('utf-8')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "b'\\xe3\\x83\\x871'" | |
| ] | |
| }, | |
| "execution_count": 4, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "t1.encode('utf-8')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "b'\\xe3\\x83\\x86\\xe3\\x82\\x992'" | |
| ] | |
| }, | |
| "execution_count": 5, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "t2.encode('utf-8')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "bytes" | |
| ] | |
| }, | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "type(t1.encode('utf-8'))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "'デ'" | |
| ] | |
| }, | |
| "execution_count": 7, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "b'\\xe3\\x83\\x87'.decode()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "'デ'" | |
| ] | |
| }, | |
| "execution_count": 8, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "b'\\xe3\\x83\\x86\\xe3\\x82\\x99'.decode()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "'テ'" | |
| ] | |
| }, | |
| "execution_count": 9, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "b'\\xe3\\x83\\x86'.decode()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 10, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "'゙'" | |
| ] | |
| }, | |
| "execution_count": 10, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "b'\\xe3\\x82\\x99'.decode()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 11, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "b'\\xe3\\x83\\x98'" | |
| ] | |
| }, | |
| "execution_count": 11, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "h.encode('utf-8')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 12, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "b'\\xe3\\x81\\x86\\xe3\\x82\\x9b'" | |
| ] | |
| }, | |
| "execution_count": 12, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "'う゛'.encode('utf-8')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 13, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "b'\\xe3\\x81\\x86'" | |
| ] | |
| }, | |
| "execution_count": 13, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "'う'.encode('utf-8')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 14, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "'ゔ'" | |
| ] | |
| }, | |
| "execution_count": 14, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "b'\\xe3\\x81\\x86\\xe3\\x82\\x99'.decode() #「う」に点々" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 15, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "b'\\xe3\\x82\\xa6'" | |
| ] | |
| }, | |
| "execution_count": 15, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "'ウ'.encode('utf-8')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 16, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "b'\\xe3\\x83\\xb4'" | |
| ] | |
| }, | |
| "execution_count": 16, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "'ヴ'.encode('utf-8')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 17, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "'ヴ'" | |
| ] | |
| }, | |
| "execution_count": 17, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "b'\\xe3\\x82\\xa6\\xe3\\x82\\x99'.decode()#「ウ」に点々" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# 置換辞書作成" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 18, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# Build repdict: decomposed kana (base letter + combining mark) -> precomposed single character.\n", | |
| "# Reference (kamosawa's Hatena diary, 2015-10-15): http://d.hatena.ne.jp/kamosawa/20151015\n", | |
| "#   c + '\\u309a'  : base letter followed by the combining handakuten (two code points)\n", | |
| "#   c + '\\u3099'  : base letter followed by the combining dakuten (two code points)\n", | |
| "#   chr(ord(c)+2) : the precomposed semi-voiced letter, two code points after the base\n", | |
| "#   chr(ord(c)+1) : the precomposed voiced letter, one code point after the base\n", | |
| "\n", | |
| "repdict = {}\n", | |
| "repdict.update({c + '\\u309a': chr(ord(c) + 2) for c in 'はひふへほハヒフヘホ'})\n", | |
| "repdict.update({c + '\\u3099': chr(ord(c) + 1)\n", | |
| "                for c in 'かきくけこさしすせそたちつてとはひふへほカキクケコサシスセソタチツテトハヒフヘホ'})\n", | |
| "\n", | |
| "# U -> VU does not follow the +1 layout (the code point after U is a small E),\n", | |
| "# so the hiragana and katakana pairs are both mapped explicitly.\n", | |
| "repdict['\\u3046\\u3099'] = '\\u3094'  # hiragana U + dakuten -> hiragana VU\n", | |
| "repdict['\\u30a6\\u3099'] = '\\u30f4'  # katakana U + dakuten -> katakana VU" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 19, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "{'が': 'が',\n", | |
| " 'ぎ': 'ぎ',\n", | |
| " 'ぐ': 'ぐ',\n", | |
| " 'げ': 'げ',\n", | |
| " 'ご': 'ご',\n", | |
| " 'ざ': 'ざ',\n", | |
| " 'じ': 'じ',\n", | |
| " 'ず': 'ず',\n", | |
| " 'ぜ': 'ぜ',\n", | |
| " 'ぞ': 'ぞ',\n", | |
| " 'だ': 'だ',\n", | |
| " 'ぢ': 'ぢ',\n", | |
| " 'づ': 'づ',\n", | |
| " 'で': 'で',\n", | |
| " 'ど': 'ど',\n", | |
| " 'ば': 'ば',\n", | |
| " 'ぱ': 'ぱ',\n", | |
| " 'び': 'び',\n", | |
| " 'ぴ': 'ぴ',\n", | |
| " 'ぶ': 'ぶ',\n", | |
| " 'ぷ': 'ぷ',\n", | |
| " 'べ': 'べ',\n", | |
| " 'ぺ': 'ぺ',\n", | |
| " 'ぼ': 'ぼ',\n", | |
| " 'ぽ': 'ぽ',\n", | |
| " 'ヴ': 'ヴ',\n", | |
| " 'ガ': 'ガ',\n", | |
| " 'ギ': 'ギ',\n", | |
| " 'グ': 'グ',\n", | |
| " 'ゲ': 'ゲ',\n", | |
| " 'ゴ': 'ゴ',\n", | |
| " 'ザ': 'ザ',\n", | |
| " 'ジ': 'ジ',\n", | |
| " 'ズ': 'ズ',\n", | |
| " 'ゼ': 'ゼ',\n", | |
| " 'ゾ': 'ゾ',\n", | |
| " 'ダ': 'ダ',\n", | |
| " 'ヂ': 'ヂ',\n", | |
| " 'ヅ': 'ヅ',\n", | |
| " 'デ': 'デ',\n", | |
| " 'ド': 'ド',\n", | |
| " 'バ': 'バ',\n", | |
| " 'パ': 'パ',\n", | |
| " 'ビ': 'ビ',\n", | |
| " 'ピ': 'ピ',\n", | |
| " 'ブ': 'ブ',\n", | |
| " 'プ': 'プ',\n", | |
| " 'ベ': 'ベ',\n", | |
| " 'ペ': 'ペ',\n", | |
| " 'ボ': 'ボ',\n", | |
| " 'ポ': 'ポ'}" | |
| ] | |
| }, | |
| "execution_count": 19, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "repdict" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# ファイル内の置換" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## テキストファイルの場合" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 20, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# Text file case: read the whole sample into memory.\n", | |
| "# The encoding is given explicitly because open() otherwise uses the platform default,\n", | |
| "# and the with-block closes the file even if read() raises.\n", | |
| "with open('dakutensample.txt', encoding='utf-8') as f:\n", | |
| "    contents = f.read()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 21, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "'ゼロからはじめるデータサイエンス_oreilly-978-4-87311-786-7e.pdf\\n• ユーザーは定期的にオンライン状態のデータ同期を実行することにより、オフライン状態でもデータの登録・編集を含めたシステムの利用が可能とする \\n• PC環境はWebアプリケーションを前提とするが、クライアント側アプリケーションのインストールが必要な場合は明記すること '" | |
| ] | |
| }, | |
| "execution_count": 21, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "contents" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 22, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# Apply every repdict entry to the text: each key is swapped for its mapped value.\n", | |
| "for decomposed, composed in repdict.items():\n", | |
| "    contents = contents.replace(decomposed, composed)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 23, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# Write the converted text out as UTF-8; the with-block flushes and closes the file\n", | |
| "# even if write() raises.\n", | |
| "with open('dakutenresults.txt', 'w', encoding='utf-8') as w:\n", | |
| "    w.write(contents)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 24, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "'ゼロからはじめるデータサイエンス_oreilly-978-4-87311-786-7e.pdf\\n• ユーザーは定期的にオンライン状態のデータ同期を実行することにより、オフライン状態でもデータの登録・編集を含めたシステムの利用が可能とする \\n• PC環境はWebアプリケーションを前提とするが、クライアント側アプリケーションのインストールが必要な場合は明記すること '" | |
| ] | |
| }, | |
| "execution_count": 24, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "contents" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "実際に確かめてみる" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 25, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "b'\\xe3\\x82\\xb5\\xe3\\x82\\x99'" | |
| ] | |
| }, | |
| "execution_count": 25, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "'ザ'.encode('utf-8')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 26, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "b'\\xe3\\x82\\xb6'" | |
| ] | |
| }, | |
| "execution_count": 26, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "'ザ'.encode('utf-8')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Excelファイルの場合" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 27, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "from openpyxl import load_workbook" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 28, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def henkan(ws):\n", | |
| "    \"\"\"Apply the repdict replacements to every text cell of a worksheet, in place.\n", | |
| "\n", | |
| "    Args:\n", | |
| "        ws: a worksheet object exposing ``rows`` (in this notebook, ``wb[name]``\n", | |
| "            from a workbook opened with openpyxl's ``load_workbook``).\n", | |
| "\n", | |
| "    Returns:\n", | |
| "        None. Each affected cell's ``value`` is reassigned.\n", | |
| "\n", | |
| "    Cells whose value is not a str (empty cells, numbers, dates, ...) are skipped,\n", | |
| "    since only strings support ``replace``.\n", | |
| "    \"\"\"\n", | |
| "    for row in ws.rows:\n", | |
| "        for cell in row:\n", | |
| "            text = cell.value\n", | |
| "            if not isinstance(text, str):\n", | |
| "                continue\n", | |
| "            # Work on a local string and assign once, rather than once per repdict entry.\n", | |
| "            for decomposed, composed in repdict.items():\n", | |
| "                text = text.replace(decomposed, composed)\n", | |
| "            cell.value = text" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 29, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "wb = load_workbook(filename = \"excelsample.xlsx\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 30, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# Run henkan on every sheet of the workbook, then save under a new file name.\n", | |
| "for sheet_name in wb.sheetnames:\n", | |
| "    henkan(wb[sheet_name])\n", | |
| "\n", | |
| "wb.save(filename='results.xlsx')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 31, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "b'\\xe3\\x83\\x90'" | |
| ] | |
| }, | |
| "execution_count": 31, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "'バ'.encode('utf-8')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 32, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "b'\\xe3\\x83\\x9d'" | |
| ] | |
| }, | |
| "execution_count": 32, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "'ポ'.encode('utf-8')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.6.0" | |
| }, | |
| "toc": { | |
| "colors": { | |
| "hover_highlight": "#DAA520", | |
| "running_highlight": "#FF0000", | |
| "selected_highlight": "#FFD700" | |
| }, | |
| "moveMenuLeft": true, | |
| "nav_menu": { | |
| "height": "102px", | |
| "width": "252px" | |
| }, | |
| "navigate_menu": true, | |
| "number_sections": true, | |
| "sideBar": true, | |
| "threshold": 4, | |
| "toc_cell": true, | |
| "toc_section_display": "block", | |
| "toc_window_display": true, | |
| "widenNotebook": false | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 2 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment