Scripts📝
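## Akita Prefecture: plots the population trend of Akita City, then builds an
## animated choropleth of municipal population-density ranks (gganimate,
## municipality codes from the jisx0402 package).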
akita <- jisx0402::municipality |>
  dplyr::filter(
    pref_code == "05",
    is.na(end_date),
    !stringr::str_ends(name, "郡")
  ) |>
  dplyr::mutate(
    muni_code = paste0(pref_code, city_code),
    muni_code = paste0(muni_code, jisx0402::check_digit(muni_code)),
    start_year = lubridate::year(start_date)
  ) |>
  dplyr::select(muni_code, name, start_year) |>
  dplyr::arrange(desc(start_year))

year <- max(akita$start_year, na.rm = TRUE)

dat <- arrow::read_parquet("~/Downloads/data/jpop.parquet") |>
  dplyr::filter(.data$year >= .env$year) |>
  dplyr::right_join(akita, by = c("code" = "muni_code"))

require(ggplot2)

dat |>
  dplyr::filter(name == "秋田市", age == "all") |>
  ggplot(aes(year, population)) +
  geom_line(aes(colour = sex)) +
  scale_y_log10()

ga <- jisx0402::jptopography("designated") |>
  dplyr::filter(stringr::str_starts(muni_code, "05")) |>
  rlang::as_function(
    ~ dplyr::mutate(.,
      muni_code = paste0(muni_code, jisx0402::check_digit(muni_code)),
      area = units::set_units(sf::st_area(.), km^2)
    )
  )() |>
  dplyr::right_join(
    dplyr::filter(dat, age == "all"),
    by = c("muni_code" = "code")
  ) |>
  dplyr::group_by(sex) |>
  dplyr::mutate(
    denst = round(population / area),
    rank = dplyr::percent_rank(denst) * 10
  ) |>
  dplyr::ungroup() |>
  ggplot() +
  geom_sf(aes(fill = rank), na.rm = TRUE, show.legend = FALSE) +
  facet_wrap(~ sex) +
  labs(
    title = "Rank of Population Density in Akita, throughout 2006-2023",
    subtitle = "Year: {frame_time}",
    caption = paste(
      "地図データ:「国土数値情報 行政区域データ」(国土交通省)",
      "https://nlftp.mlit.go.jp/ksj/gml/datalist/KsjTmplt-N03-v3_0.html を加工して作成",
      sep = "\n"
    )
  ) +
  theme_light() +
  gganimate::transition_time(year)

gganimate::animate(ga, renderer = gganimate::ffmpeg_renderer())
gganimate::anim_save("akita-denst1.mp4", path = "Videos")
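## Tokyo variant of the script above: population trends for the special wards
## and an animated choropleth of their population-density ranks.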
tokyo <- jisx0402::municipality |>
  dplyr::filter(
    pref_code == "13",
    is.na(end_date),
    !stringr::str_ends(name, "郡|(特別区)|(支庁)")
  ) |>
  dplyr::mutate(
    muni_code = paste0(pref_code, city_code),
    muni_code = paste0(muni_code, jisx0402::check_digit(muni_code)),
    start_year = lubridate::year(start_date)
  ) |>
  dplyr::select(muni_code, name, start_year) |>
  dplyr::arrange(desc(start_year))

year <- max(tokyo$start_year, na.rm = TRUE)

dat <- arrow::read_parquet("~/Downloads/data/jpop.parquet") |>
  dplyr::filter(.data$year >= .env$year) |>
  dplyr::right_join(tokyo, by = c("code" = "muni_code"))

require(ggplot2)

dat |>
  dplyr::filter(age == "all", stringr::str_detect(name, "区"),
                year > 2005L) |>
  ggplot(aes(year, population)) +
  geom_line(aes(colour = sex)) +
  facet_wrap(~ name) +
  scale_y_log10()

ga <- jisx0402::jptopography("all", resolution = 1) |>
  dplyr::filter(stringr::str_starts(muni_code, "13")) |>
  rlang::as_function(
    ~ dplyr::mutate(.,
      muni_code = paste0(muni_code, jisx0402::check_digit(muni_code)),
      area = units::set_units(sf::st_area(.), km^2)
    )
  )() |>
  dplyr::right_join(
    dplyr::filter(dat, age == "all", stringr::str_detect(name, "区")),
    by = c("muni_code" = "code")
  ) |>
  dplyr::group_by(sex) |>
  dplyr::mutate(
    denst = round(population / area),
    rank = dplyr::percent_rank(denst) * 10
  ) |>
  dplyr::ungroup() |>
  ggplot() +
  geom_sf(aes(fill = rank), na.rm = TRUE, show.legend = FALSE) +
  facet_wrap(~ sex) +
  labs(
    title = "Rank of Population Density in Tokyo Special Wards, throughout 2006-2023",
    subtitle = "Year: {frame_time}",
    caption = paste(
      "地図データ:「国土数値情報 行政区域データ」(国土交通省)",
      "https://nlftp.mlit.go.jp/ksj/gml/datalist/KsjTmplt-N03-v3_0.html を加工して作成",
      sep = "\n"
    )
  ) +
  theme_light() +
  gganimate::transition_time(year, range = c(2006L, 2023L))

gganimate::animate(ga, renderer = gganimate::ffmpeg_renderer())
gganimate::anim_save("tokyo23-denst1.mp4", path = "Videos")
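## ASCII-art helpers: each function interpolates `text` into the artwork with
## glue, after swapping spaces (apparently to keep the alignment intact when
## the art is pasted elsewhere).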
#' @export
accelerate <- function(text = "精神を加速させろ") {
  r"(
∩_∩
/ \ /\
|(゚)=(゚)|
| ●_● |
/ ヽ
r⌒| 〃 ------ ヾ |
/ i/ |_二__ノ
./ / / ) {text}
./ / / //
/ ./ / ̄
.ヽ、__./ / ⌒ヽ
r / |
/ ノ
/ / /
./ // /
/. ./ ./ /
i / ./ /
i ./ .ノ.^/
i ./ |_/
i /
/ /
(_/
)" |> stringi::stri_replace_all_fixed(" ", " ") |> glue::glue()
}

#' @export
qb <- function(text = "わけがわからないよ") {
  r"(
|\ /|
|\\ //|
: ,> `´ ̄`´ < ′
. V V
. i< ● ● >i
八 、_,_, 八 {text}
. / 个 . _ _ . 个 ',
_/ il ,' '. li ',__
)" |> stringi::stri_replace_all_fixed(" ", " ") |> glue::glue()
}
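## Polite scrapers for uta-net.com: fetch the song list for an artist,
## lyricist, or composer (or a keyword search), then append each song's
## lyrics and metadata to a CSV.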
# get_lyrics.R -----

#' Scrape lyrics from list
#'
#' @param df A tibble returned from \code{get_lyrics_list}.
#' @param file String; file name to which lyrics are appended.
#' @param links String; column name of lyrics links.
#' @returns `file` is returned invisibly.
#' @export
#' @examples
#' \dontrun{
#' csv_file <-
#'   aznyan::get_lyrics_list("23729") |>
#'   aznyan::get_lyrics("23729.csv")
#' tbl <-
#'   readr::read_csv(csv_file, col_names = FALSE, col_types = "cccc___cDn") |>
#'   dplyr::rename(
#'     title = X1,
#'     artist = X2,
#'     lyricist = X3,
#'     composer = X4,
#'     text = X8,
#'     released = X9,
#'     page_view = X10
#'   )
#' }
get_lyrics <- function(df, file, links = "link") {
  base_url <- "https://www.uta-net.com"
  links <- dplyr::pull(df, {{ links }})
  url <- paste(base_url, links, sep = "/")
  session <- polite::bow(base_url, force = FALSE)
  purrr::iwalk(url, function(q, itr) {
    html <- session %>%
      polite::nod(q) %>%
      polite::scrape()
    lyric_body <- html %>%
      rvest::html_element("#kashi_area") %>%
      rvest::html_text2()
    info <- html %>%
      rvest::html_element(".song-infoboard") %>%
      rvest::html_element(".detail") %>%
      rvest::html_text() %>%
      stringr::str_split("\\n") %>%
      unlist() %>%
      purrr::pluck(4) %>%
      stringr::str_extract_all(pattern = "[\\d,/]+") %>%
      unlist()
    data.frame(
      df[itr, ],
      lyric = lyric_body,
      released = info[1],
      page_viewed = info[2]
    ) %>%
      dplyr::mutate(
        released = lubridate::as_date(.data$released),
        page_viewed = stringr::str_remove_all(.data$page_viewed, ",") %>%
          unlist() %>%
          as.numeric()
      ) %>%
      readr::write_csv(file, append = TRUE, progress = FALSE)
  })
  invisible(file)
}

# lyrics_list.R -----

#' Scrape table of lyrics list
#'
#' @param id String; substring xxx of 'https://www.uta-net.com/:type:/xxx/'.
#' @param type String; one of "artist", "lyricist", or "composer".
#' @returns A tibble.
#' @export
get_lyrics_list <- function(id,
                            type = c("artist", "lyricist", "composer")) {
  base_url <- "https://www.uta-net.com"
  type <- rlang::arg_match(type, c("artist", "lyricist", "composer"))
  url <- paste(base_url, type, id, "", sep = "/")
  session <-
    polite::bow(base_url, force = FALSE) %>%
    polite::nod(url)
  html <- session %>%
    polite::scrape()
  page_list <- html %>%
    rvest::html_element(".songlist-table-block") %>%
    rvest::html_element("tfoot") %>%
    rvest::html_text() %>%
    stringr::str_extract("([:number:]+)")
  purrr::map_dfr(seq.int(as.integer(page_list)), function(i) {
    html <- session %>%
      polite::nod(path = paste(url, "0", as.character(i), "", sep = "/")) %>%
      polite::scrape()
    tables <- html %>%
      rvest::html_elements(".songlist-table-block") %>%
      rvest::html_elements("table")
    df <- tables %>%
      rvest::html_table() %>%
      purrr::map_dfr(~ stats::na.omit(.))
    titles <- html %>%
      rvest::html_elements(".songlist-table-block") %>%
      rvest::html_elements(".songlist-title") %>%
      rvest::html_text()
    links <- tables %>%
      rvest::html_elements(".sp-w-100") %>%
      rvest::html_elements("a") %>%
      rvest::html_attr("href") %>%
      purrr::discard(~ . %in% c("https://www.uta-net.com/ranking/total/"))
    df %>%
      dplyr::slice_head(n = nrow(df) - 1) %>%
      dplyr::rename(
        text_lab = "\u66f2\u540d",
        artist = "\u6b4c\u624b\u540d",
        lyricist = "\u4f5c\u8a5e\u8005\u540d",
        composer = "\u4f5c\u66f2\u8005\u540d",
        leading = "\u6b4c\u3044\u51fa\u3057"
      ) %>%
      dplyr::bind_cols(
        data.frame(title = titles, link = links, source_page = i)
      ) %>%
      dplyr::select(
        .data$title,
        .data$artist,
        .data$lyricist,
        .data$composer,
        .data$leading,
        .data$link,
        .data$source_page
      )
  })
}

#' Search lyrics list by keyword
#'
#' @param keyword String; search phrase.
#' @param sort String; one of "new", "popular", "title", or "artist".
#' @return A tibble.
#' @export
search_lyrics_list <- function(keyword,
                               sort = c("new", "popular", "title", "artist")) {
  base_url <- "https://www.uta-net.com"
  sort <- rlang::arg_match(sort, c("new", "popular", "title", "artist"))
  sort <- dplyr::case_when(
    sort == "new" ~ 6,
    sort == "popular" ~ 4,
    sort == "title" ~ 1,
    sort == "artist" ~ 7,
    TRUE ~ 1
  )
  url <- paste(base_url, "search", "", sep = "/")
  session <-
    polite::bow(base_url, force = FALSE) %>%
    polite::nod(url)
  html <- session %>%
    polite::scrape(query = list(
      Keyword = stringr::str_trim(keyword),
      Aselect = "2",
      Bselect = "3",
      sort = sort
    ))
  page_list <- html %>%
    rvest::html_element("#songlist-sort-paging") %>%
    rvest::html_text() %>%
    stringr::str_extract("([:number:]+)")
  purrr::map_dfr(seq.int(as.integer(page_list)), function(i) {
    html <- session %>%
      polite::scrape(query = list(
        Keyword = enc2utf8(keyword),
        Aselect = "2",
        Bselect = "3",
        sort = sort,
        pnum = i
      ))
    tables <- html %>%
      rvest::html_elements(".songlist-table-block") %>%
      rvest::html_elements("table")
    df <- tables %>%
      rvest::html_table() %>%
      purrr::map_dfr(~ stats::na.omit(.))
    titles <- html %>%
      rvest::html_elements(".songlist-table-block") %>%
      rvest::html_elements(".songlist-title") %>%
      rvest::html_text()
    links <- tables %>%
      rvest::html_elements(".sp-w-100") %>%
      rvest::html_elements("a") %>%
      rvest::html_attr("href") %>%
      purrr::discard(~ . %in% c("https://www.uta-net.com/ranking/total/"))
    df %>%
      dplyr::slice_head(n = nrow(df) - 1) %>%
      dplyr::rename(
        text_lab = "\u66f2\u540d",
        artist = "\u6b4c\u624b\u540d",
        lyricist = "\u4f5c\u8a5e\u8005\u540d",
        composer = "\u4f5c\u66f2\u8005\u540d",
        leading = "\u6b4c\u3044\u51fa\u3057"
      ) %>%
      dplyr::bind_cols(
        data.frame(title = titles, link = links, source_page = i)
      ) %>%
      dplyr::select(
        .data$title,
        .data$artist,
        .data$lyricist,
        .data$composer,
        .data$leading,
        .data$link,
        .data$source_page
      )
  })
}
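## Camera test for the 'rasengan' package: a 3D helix is projected through
## look-at, perspective, and viewport transforms while the camera orbits,
## and the frames are saved as a GIF with gifski.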
# pkgload::load_all(export_all = FALSE)
use("rasengan", "%!*%")

t <- seq(0, 8 * pi, length.out = 501)[-1]
helix <-
  dplyr::tibble(
    x = cos(t),
    y = sin(t),
    z = seq(-1, 1, length.out = length(t)),
    w = 1
  )
s <- seq(-pi * .5, pi * 1.5, length.out = 100)
pov <-
  dplyr::tibble(
    x = 3,
    y = 3 * cos(s),
    z = 3 * sin(s)
  ) |>
  as.matrix()

# rgl::plot3d(helix, col = "red")
# rgl::points3d(pov, col = "blue")

gifski::save_gif(
  {
    for (view in seq_len(nrow(pov))) {
      xfm <-
        affiner::transform3d() %*%
        rasengan::lookat3d(pov[view, ], c(0.1, 0, 0), c(0, 1, 0)) %*%
        rasengan::persp3d(pi / 4, 4 / 3)
      helix_screen <- (helix %*% xfm %!*% rasengan::viewport3d(480, 360))
      plot(
        helix_screen[, 1],
        helix_screen[, 2],
        type = "p",
        col = "blue",
        asp = 4 / 3,
        cex = 1,
        pch = 16,
        xlim = c(0, 480),
        ylim = c(0, 360)
      )
    }
  },
  gif_file = "camera-test.gif",
  width = 480,
  height = 360,
  delay = 1 / 15,
  progress = TRUE
)
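## Quarto chunk that renders the help page of base R's `use()` as a
## blockquote: Rd -> HTML -> plain text -> commonmark.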
```{r}
#| label: help-text
#| echo: false
#| output: asis
text <- withr::with_options(list(useFancyQuotes = FALSE),
  capture.output({
    tools::Rd2HTML(
      tools::Rd_db("base")[["use.Rd"]],
      outputEncoding = "UTF-8"
    )
  }) |>
    paste0(collapse = "\n") |>
    rvest::read_html() |>
    rvest::html_nodes("main") |>
    rvest::html_text()
)
text |>
  stringr::str_split("\n") |>
  unlist(use.names = FALSE) |>
  rlang::as_function(~ paste("> ", .))() |>
  commonmark::markdown_commonmark() |>
  cat()
```
## An example of drawing hand-drawn-like lines using a ripple effect with 'ggfx'.
## Feel free to copy and modify this code as you see fit!
Rcpp::sourceCpp(code = R"{
/**
 * Requires <https://github.com/paithiov909/nativeshadr> to be installed.
 * The shader part is based on <https://github.com/prontopablo/FIP/blob/main/data/ripple.glsl>
 */
// [[Rcpp::depends(RcppParallel, nativeshadr)]]
#include <nativeshadr.h>

float4 frag(int2 wh, RMatrix<int> nr, float freq, float amp, float2 offset) {
  float2 iResolution = float2(nr.ncol(), nr.nrow());
  float2 uv = float2(wh) / iResolution;
  // Center coordinates of the screen with offset
  float2 center = iResolution / 2.0 + offset;
  // Calculate the distance from the current pixel to the center
  float distance = length(uv - center);
  // Calculate the ripple effect using a sine function with parameters
  float ripple = sin(distance * freq) * amp;
  // Offset the texture coordinate based on the ripple effect
  float2 tc = (uv + ripple) * iResolution;
  if (tc.x > iResolution.x || tc.y > iResolution.y || tc.x < 0.0 || tc.y < 0.0) {
    return float4(0.0, 0.0, 0.0, 0.0);
  }
  float4 color = float4(texture_eval(nr, tc)) / 255.0;
  return color;
}

uint32_t shader(int2 wh, RMatrix<int> nr, const vvd& uniforms) {
  return int4_to_icol(frag(wh, nr, uniforms[0][0], uniforms[1][0], float2(uniforms[2][0], uniforms[2][1])) * 255.0);
}

// [[Rcpp::export]]
Rcpp::IntegerVector test_ripple(Rcpp::IntegerMatrix nr, Rcpp::List uni) {
  const std::vector<double>& freq = uni["freq"];
  const std::vector<double>& amp = uni["amp"];
  const std::vector<double>& offset = uni["offset"];
  const vvd uniforms = {freq, amp, offset};
  return vectorize_shader(shader)(nr, uniforms);
}
}")

library(ggplot2)
library(ggfx)

ripple <- \(x, freq = 64.0, amp = .01, offset = c(0, 0)) {
  vp <- get_viewport_area(x)
  vp <- test_ripple(vp, list(freq = freq, amp = amp, offset = offset))
  set_viewport_area(x, vp)
}

ggplot(iris, aes(x = Sepal.Length, y = Sepal.Width)) +
  with_custom(
    geom_step(aes(colour = Species, group = Species), linewidth = 1.25),
    filter = ripple,
    freq = 24,
    amp = .00456,
    offset = c(0, 0)
  )
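## Builds vocabulary lists (nouns, predicates, modifiers) from what appear to
## be lower-house committee speeches in data/lower_care.csv, tokenized with
## gibasa and a Sudachi dictionary; mora counts come from the reading forms.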
dat <-
  readr::read_csv("data/lower_care.csv") |>
  dplyr::mutate(
    issue = factor(issue),
    speaker = factor(speaker),
    speakerGroup = factor(speakerGroup),
    speech = stringi::stri_replace_all_regex(speech, "\\s+", "") |>
      stringi::stri_trans_nfkc()
  )

# Nouns (名詞)
toks <- dat |>
  dplyr::select(speechID, speech, speaker, speakerGroup) |>
  gibasa::tokenize(
    speech,
    speechID,
    sys_dic = here::here("sudachidict/build")
  ) |>
  gibasa::prettify(
    into = gibasa::get_dict_features("sudachi"),
    col_select = c("POS1", "normalized_form", "reading_form")
  ) |>
  dplyr::mutate(
    reading_form = stringi::stri_replace_all_regex(
      reading_form,
      "[ァィゥェォャュョ]",
      ""
    ),
    mora = stringi::stri_length(reading_form),
    .by = doc_id
  )
noun <- toks |>
  dplyr::reframe(
    tok = gibasa::ngram_tokenizer(3)(token, sep = "-"),
    pos = gibasa::ngram_tokenizer(3)(POS1, sep = "-"),
    .by = doc_id
  ) |>
  dplyr::filter(
    pos == "名詞-助詞-名詞",
    stringi::stri_detect_fixed(tok, "-の-")
  ) |>
  dplyr::distinct(tok) |>
  dplyr::pull(tok) |>
  stringi::stri_extract_first_regex("^([[:alpha:]]+)")
meishi <- toks |>
  dplyr::count(doc_id, token, mora) |>
  tidytext::bind_tf_idf(token, doc_id, n) |>
  dplyr::filter(token %in% noun, mora %in% c(2, 3, 4, 6)) |>
  dplyr::filter(dplyr::percent_rank(tf_idf) > .80) |>
  dplyr::distinct(token, mora)
readr::write_csv(
  meishi,
  "vocab_n.csv"
)

# Predicates (述部)
ch <-
  stringi::stri_split_boundaries(
    dat$speech,
    opts_brkiter = stringi::stri_opts_brkiter(
      locale = "ja@ld=auto;lw=phrase"
    )
  ) |>
  purrr::map(
    ~ stringi::stri_subset_regex(., "。")
  ) |>
  unlist() |>
  unique() |>
  stringi::stri_replace_all_fixed("。", "") |>
  purrr::keep(~ stringi::stri_length(.) > 2)
len <-
  gibasa::tokenize(ch, sys_dic = here::here("sudachidict/build")) |>
  gibasa::prettify(
    into = gibasa::get_dict_features("sudachi"),
    col_select = "reading_form"
  ) |>
  gibasa::as_tokens(reading_form, pos_field = NULL) |>
  purrr::map(
    ~ stringi::stri_replace_all_regex(., "[ァィゥェォャュョ.]", "") |>
      stringi::stri_length() |>
      sum()
  ) |>
  unlist()
jutsubu <-
  tibble::tibble(
    word = ch,
    mora = len
  ) |>
  dplyr::filter(
    !gibasa::is_blank(word),
    mora > 4, mora < 8
  ) |>
  dplyr::arrange(mora)
readr::write_csv(
  jutsubu,
  "vocab_j.csv"
)

# Modifiers (修飾部)
shushoku <- toks |>
  dplyr::filter(
    POS1 %in% c("形容詞")
  ) |>
  dplyr::distinct(normalized_form, .keep_all = TRUE) |>
  dplyr::select(token, mora) |>
  dplyr::arrange(mora)
readr::write_csv(
  shushoku,
  "vocab_s.csv"
)
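## Same vocabulary-extraction pipeline as above, applied to song lyrics
## collected in data/39.xlsx.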
dat <-
  readxl::read_xlsx(
    "data/39.xlsx",
    col_names = c(
      "title", "artist", "lyricist", "composer",
      "leading", "link", "source", "text",
      "release", "view"
    ),
    na = "NA"
  ) |>
  dplyr::reframe(
    text = stringi::stri_split_regex(text, "[\\s]{2,}") |>
      unlist() |>
      stringi::stri_replace_all_regex("[\\n]+", "。") |>
      stringi::stri_replace_all_regex("[「」()\\(\\)]", "") |>
      stringi::stri_trans_tolower(),
    .by = link
  )

# Nouns (名詞)
toks <- dat |>
  gibasa::tokenize(
    text,
    link,
    sys_dic = here::here("sudachidict/build")
  ) |>
  gibasa::prettify(
    into = gibasa::get_dict_features("sudachi"),
    col_select = c("POS1", "normalized_form", "reading_form")
  ) |>
  dplyr::mutate(
    reading_form = stringi::stri_replace_all_regex(
      reading_form,
      "[ァィゥェォャュョ]",
      ""
    ),
    mora = stringi::stri_length(reading_form),
    .by = doc_id
  )
noun <- toks |>
  dplyr::reframe(
    tok = gibasa::ngram_tokenizer(3)(token, sep = "-"),
    pos = gibasa::ngram_tokenizer(3)(POS1, sep = "-"),
    .by = doc_id
  ) |>
  dplyr::filter(
    pos == "名詞-助詞-名詞",
    stringi::stri_detect_fixed(tok, "-の-")
  ) |>
  dplyr::distinct(tok) |>
  dplyr::pull(tok) |>
  stringi::stri_extract_first_regex("^([[:alpha:]]+)")
meishi <- toks |>
  dplyr::filter(token %in% noun, mora %in% c(2, 3, 4, 6)) |>
  dplyr::distinct(token, mora)
readr::write_csv(
  meishi,
  "vocab_n.csv"
)

# Predicates (述部)
ch <-
  stringi::stri_split_boundaries(
    dat$text,
    opts_brkiter = stringi::stri_opts_brkiter(
      locale = "ja@ld=auto;lw=phrase"
    )
  ) |>
  purrr::map(
    ~ stringi::stri_subset_regex(., "。")
  ) |>
  unlist() |>
  unique() |>
  stringi::stri_replace_all_fixed("。", "") |>
  purrr::keep(~ stringi::stri_length(.) > 2)
len <-
  gibasa::tokenize(ch, sys_dic = here::here("sudachidict/build")) |>
  gibasa::prettify(
    into = gibasa::get_dict_features("sudachi"),
    col_select = "reading_form"
  ) |>
  gibasa::as_tokens(reading_form, pos_field = NULL) |>
  purrr::map(
    ~ stringi::stri_replace_all_regex(., "[ァィゥェォャュョ.]", "") |>
      stringi::stri_length() |>
      sum()
  ) |>
  unlist()
jutsubu <-
  tibble::tibble(
    word = ch,
    mora = len
  ) |>
  dplyr::filter(
    !gibasa::is_blank(word),
    mora > 2, mora < 8
  ) |>
  dplyr::arrange(mora)
readr::write_csv(
  jutsubu,
  "vocab_j.csv"
)

# Modifiers (修飾部)
shushoku <- toks |>
  dplyr::filter(
    POS1 %in% c("形容詞", "副詞")
  ) |>
  dplyr::distinct(normalized_form, .keep_all = TRUE) |>
  dplyr::select(token, mora) |>
  dplyr::arrange(mora)
readr::write_csv(
  shushoku,
  "vocab_s.csv"
)
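## A minimal R wrapper around the Kagome morphological analyzer CLI:
## manages a local Kagome server process and tokenizes text over its
## HTTP API with httr2/furrr.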
#' Environment for internal use
#'
#' @noRd
#' @keywords internal
.env <- rlang::env(server = NULL)

#' @noRd
reset_encoding <- function(chr, encoding = "UTF-8") {
  Encoding(chr) <- encoding
  return(chr)
}

#' Call lattice command
#'
#' A debugging tool for the tokenization process that outputs a lattice
#' in Graphviz dot format.
#'
#' @param sentences Character vector.
#' @param ... Other arguments are passed to \code{DiagrammeR::grViz}.
#'
#' @export
get_lattice <- function(sentences, ...) {
  sentences <- stringi::stri_omit_na(sentences)
  dot <- processx::run("kagome", c("lattice", stringi::stri_c(sentences, collapse = " ")))$stdout
  DiagrammeR::grViz(reset_encoding(dot), ...)
}

#' Launch Kagome server
#'
#' Starts or kills the Kagome server process.
#'
#' @param dict Dictionary which the Kagome server uses. Default value is 'ipa' (IPA dictionary).
#' @return A logical value (whether the Kagome server is alive) is returned invisibly.
#'
#' @export
lanch_server <- function(dict = c("ipa", "uni")) {
  p <- rlang::env_get(.env, "server")
  if (is.null(p)) {
    dict <- rlang::arg_match(dict)
    p <- processx::process$new("kagome", c("server", "-dict", dict))
    rlang::env_bind(.env, server = p)
  } else if (p$is_alive()) {
    kill <- yesno::yesno("Kagome server is already alive. Do you want to kill its process?")
    if (kill) {
      p$kill()
      rlang::env_bind(.env, server = NULL)
    }
  }
  return(invisible(p$is_alive()))
}

#' Send a HEAD request to the Kagome server
#'
#' @param url URL; character scalar.
#' @return An httr2 response is returned invisibly.
#'
#' @export
ping <- function(url = "http://localhost:6060") {
  resp <-
    httr2::request(url) |>
    httr2::req_method("HEAD") |>
    httr2::req_perform()
  return(invisible(resp))
}

#' Send a PUT request to the tokenize API
#'
#' @param sentences Character vector to be analyzed.
#' @param url URL of the Kagome server.
#' @param mode One of `normal`, `search`, or `extended`.
#' @return A tibble.
#'
#' @export
tokenize <- function(sentences,
                     url = "http://localhost:6060/tokenize",
                     mode = c("normal", "search", "extended")) {
  mode <- rlang::arg_match(mode)
  sentences <-
    stringi::stri_omit_na(sentences) |>
    stringi::stri_split_boundaries(type = "sentence") |>
    purrr::flatten_chr()
  resps <-
    furrr::future_imap_dfr(sentences, ~ tokenize_impl(.x, .y, url, mode)) |>
    dplyr::mutate(across(where(is.character), ~ dplyr::na_if(., "*"))) |>
    dplyr::mutate_at(c("doc_id", "class"), as.factor) |>
    tibble::as_tibble()
  return(resps)
}

#' @noRd
tokenize_impl <- function(msg, idx, url = "http://localhost:6060/tokenize", mode = "normal") {
  resp <-
    httr2::request(url) |>
    httr2::req_body_json(list(
      sentence = msg,
      mode = mode
    )) |>
    httr2::req_method("PUT") |>
    httr2::req_error(function(resp) httr2::resp_status(resp) > 400) |>
    httr2::req_perform()
  return(
    purrr::map_dfr(
      httr2::resp_body_json(resp)$tokens,
      ~ data.frame(
        doc_id = idx,
        id = .$id,
        start = .$start,
        end = .$end,
        class = .$class,
        token = .$surface,
        POS1 = .$pos[[1]],
        POS2 = .$pos[[2]],
        POS3 = .$pos[[3]],
        POS4 = .$pos[[4]],
        Original = .$base_form,
        Yomi1 = .$reading,
        Yomi2 = .$pronunciation
      )
    )
  )
}

.on_unload <- function(ns) {
  p <- rlang::env_get(.env, "server")
  if (!is.null(p) && p$is_alive()) {
    p$kill()
  }
}
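## Extractive summarization of an Aozora Bunko text with lexRankr, then the
## same result rebuilt step by step: sentence similarity via proxyC and
## PageRank via igraph.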
texts <- ldccr::read_aozora(
  "https://www.aozora.gr.jp/cards/000035/files/1567_ruby_4948.zip",
  directory = tempdir()
)

sent <- readr::read_lines(texts) |>
  gibasa::tokenize(split = TRUE) |>
  gibasa::prettify() |>
  dplyr::filter(!POS1 %in% c("記号")) |>
  dplyr::group_by(doc_id, sentence_id) |>
  dplyr::reframe(
    text = paste0(token, collapse = " ")
  ) |>
  dplyr::mutate(
    doc_id = doc_id,
    sentence_id = paste(doc_id, sentence_id, sep = "_")
  )

### lexRankr ----
tok <- sent |>
  tidytext::unnest_tokens(
    token,
    text,
    token = \(x) strsplit(x, " ", fixed = TRUE)
  )
simil_df <- lexRankr::sentenceSimil(
  sentenceId = tok$sentence_id,
  token = tok$token,
  docId = tok$doc_id
)
top_n_sent <-
  lexRankr::lexRankFromSimil(
    simil_df$sent1,
    simil_df$sent2,
    simil = simil_df$similVal,
    n = 10,
    usePageRank = TRUE,
    continuous = TRUE,
    # threshold = 0.1,
    returnTies = FALSE
  )
top_n_sent |>
  dplyr::inner_join(sent, by = c("sentenceId" = "sentence_id")) |>
  dplyr::arrange(desc(value)) |>
  dplyr::as_tibble()

### replace sentenceSimil ----
mat <-
  sent |>
  tidytext::unnest_tokens(
    term,
    text,
    token = \(x) strsplit(x, " ", fixed = TRUE)
  ) |>
  dplyr::count(doc_id, term) |>
  tidytext::bind_tf_idf(term, doc_id, n) |>
  dplyr::mutate(tf_idf = n * (idf + 1)) |>
  dplyr::inner_join(
    sent |>
      tidytext::unnest_tokens(
        term,
        text,
        token = \(x) strsplit(x, " ", fixed = TRUE)
      ) |>
      dplyr::select(doc_id, sentence_id, term),
    by = c("doc_id" = "doc_id", "term" = "term")
  ) |>
  tidytext::cast_sparse(sentence_id, term, tf_idf)
dt <- proxyC::simil(
  mat,
  method = "cosine",
  # min_simil = 0.1,
  # rank = 50,
  use_nan = FALSE
)
dt |>
  tidytext:::tidy.dfm() |>
  dplyr::rename(
    s1 = document,
    s2 = term,
    weight = count
  ) |>
  (\(simil_df) {
    lexRankr::lexRankFromSimil(
      simil_df$s1,
      simil_df$s2,
      simil = simil_df$weight,
      n = nrow(simil_df),
      usePageRank = TRUE,
      continuous = TRUE,
      # threshold = 0.1,
      returnTies = TRUE
    )
  })() |>
  dplyr::inner_join(sent, by = c("sentenceId" = "sentence_id")) |>
  dplyr::arrange(desc(value)) |>
  dplyr::as_tibble()

### replace lexRankFromSimil ----
dt |>
  tidytext:::tidy.dfm() |>
  dplyr::rename(
    s1 = document,
    s2 = term,
    weight = count
  ) |>
  (\(simil_df) {
    pr <- simil_df |>
      igraph::graph_from_data_frame(
        directed = FALSE
      ) |>
      igraph::page_rank(
        directed = FALSE,
        damping = .85
      ) |>
      purrr::pluck("vector")
    tibble::tibble(
      sentenceId = names(pr),
      value = unname(pr)
    )
  })() |>
  dplyr::inner_join(sent, by = c("sentenceId" = "sentence_id")) |>
  dplyr::arrange(desc(value)) |>
  dplyr::as_tibble()
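## Builds a bigram network from an Aozora Bunko text (TF-IDF-weighted edges,
## label-propagation clusters coloured with viridis) and writes nodes/edges
## JSON for a front-end visualization.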
texts <- ldccr::read_aozora(
  "https://www.aozora.gr.jp/cards/000035/files/1567_ruby_4948.zip",
  directory = tempdir()
)

sent <- readr::read_lines(texts) |>
  gibasa::tokenize() |>
  gibasa::prettify() |>
  dplyr::mutate(token = dplyr::if_else(is.na(Original), token, Original)) |>
  dplyr::reframe(
    token = gibasa::ngram_tokenizer(2)(token, sep = "-"),
    pos = gibasa::ngram_tokenizer(2)(POS1, sep = "-"),
    .by = doc_id
  ) |>
  dplyr::count(doc_id, token, pos) |>
  gibasa::bind_tf_idf2(
    token, doc_id, n,
    tf = "tf2", idf = "idf3", norm = TRUE
  ) |>
  dplyr::filter(!stringr::str_detect(pos, "(助詞)|(記号)")) |>
  dplyr::summarise(
    value = sum(tf_idf) |> round(digits = 2),
    .by = token
  ) |>
  tidyr::separate_wider_delim(
    token,
    delim = "-",
    names = c("source", "target")
  ) |>
  dplyr::slice_max(value, n = 40)

nodes <- readr::read_lines(texts) |>
  gibasa::tokenize() |>
  gibasa::prettify() |>
  dplyr::mutate(token = dplyr::if_else(is.na(Original), token, Original)) |>
  dplyr::filter(token %in% unique(c(sent$source, sent$target))) |>
  dplyr::count(token) |>
  dplyr::transmute(
    id = token,
    value = n,
    cluster = sent |>
      igraph::graph_from_data_frame() |>
      igraph::cluster_label_prop() |>
      igraph::membership() |>
      rlang::as_function(~ .[id])() |>
      unname() |>
      factor() |>
      forcats::fct_lump(n = 5, other_level = "0") |>
      (\(f) {
        forcats::fct_relabel(f,
          ~ scales::viridis_pal(alpha = .8, option = "H")(nlevels(f))
        )
      })()
  )

jsonlite::write_json(
  nodes,
  file.path(here::here(), "src/components/data/nodes.json")
)
jsonlite::write_json(
  sent,
  file.path(here::here(), "src/components/data/edges.json")
)
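## Word Rotator's Distance (WRD) between the two halves of each text at every
## possible phrase boundary ("kugire"), using Sudachi normalized forms and
## chiVe embeddings through apportita/magnitude.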
require(sudachir)
require(apportita)

strj_split_boundaries <- \(x) {
  stringi::stri_split_boundaries(
    x,
    opts_brkiter = stringi::stri_opts_brkiter(
      locale = "ja@lw=phrase;ld=auto" # auto | loose | normal | strict | anywhere
    )
  )
}

split_kugire <- \(x) {
  strj_split_boundaries(x) |>
    purrr::map(\(elem) {
      len <- length(elem)
      if (len < 2) {
        return(NA_character_)
      } else {
        sapply(seq_len(len - 1), \(i) {
          s1 <- paste0(elem[1:i], collapse = "")
          s2 <- paste0(elem[(i + 1):len], collapse = "")
          paste(s1, s2, sep = "\t")
        })
      }
    }) |>
    unlist()
}

sudachi <- sudachir::rebuild_tokenizer(mode = "C")
form <- \(x) {
  unlist(sudachir::form(x, type = "normalized", pos = FALSE,
                        instance = sudachi))
}
wrd <- \(conn, s1, s2) {
  purrr::map2_dbl(s1, s2, \(el1, el2) {
    el1 <- form(el1)
    el2 <- form(el2)
    apportita::calc_wrd(conn, el1, el2)
  })
}

conn <- magnitude("models/magnitude/chive-1.2-mc90.magnitude")
dim(conn)

### tanka72 ----
dat <-
  readxl::read_xlsx("data/tanka/tanka72.xlsx") |>
  dplyr::transmute(
    id = factor(id),
    body = audubon::strj_normalize(body) |>
      stringr::str_remove_all("[^[:alnum:]]+"),
    author = factor(author)
  )
ret <- dat |>
  dplyr::reframe(
    id = id,
    phrase = split_kugire(body),
    author = author,
    .by = id
  ) |>
  tidyr::separate_wider_delim(
    phrase, delim = "\t", names = c("s1", "s2")
  )
ret <- ret |>
  dplyr::mutate(
    wrd = wrd(conn, s1, s2)
  )
arrow::write_parquet(ret, "tanka72-wrd.parquet")

### tweets ----
tweets <-
  readr::read_csv(
    "data/shinabitanori-230622.csv.gz",
    col_names = c("id", "time", "body"),
    col_types = "ccc"
  ) |>
  dplyr::filter(
    !stringr::str_detect(body, "@|(RT)"),
    stringr::str_length(body) > 14,
    stringr::str_length(body) < 30
  ) |>
  dplyr::slice_sample(n = 100) |>
  dplyr::transmute(
    id = factor(id),
    body = audubon::strj_normalize(body) |>
      ldccr::clean_url() |>
      stringr::str_remove_all("[^[:alnum:]]+")
  ) |>
  dplyr::filter(
    !stringi::stri_isempty(body)
  )
tweets <- tweets |>
  dplyr::reframe(
    id = id,
    phrase = split_kugire(body),
    .by = id
  ) |>
  tidyr::separate_wider_delim(
    phrase, delim = "\t", names = c("s1", "s2")
  )
tweets <- tweets |>
  dplyr::mutate(
    wrd = wrd(conn, s1, s2)
  )
arrow::write_parquet(tweets, "shinabitanori-wrd.parquet")
close(conn)
rm(conn)
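## Variant of the WRD script above for another tanka dataset, with a progress
## bar and a tryCatch so that pairs failing in calc_wrd yield NA.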
require(sudachir)
require(apportita)

strj_split_boundaries <- \(x) {
  stringi::stri_split_boundaries(
    x,
    opts_brkiter = stringi::stri_opts_brkiter(
      locale = "ja@lw=phrase;ld=auto" # auto | loose | normal | strict | anywhere
    )
  )
}

split_kugire <- \(x) {
  strj_split_boundaries(x) |>
    purrr::map(\(elem) {
      len <- length(elem)
      if (len < 2) {
        return(NA_character_)
      } else {
        sapply(seq_len(len - 1), \(i) {
          s1 <- paste0(elem[1:i], collapse = "")
          s2 <- paste0(elem[(i + 1):len], collapse = "")
          paste(s1, s2, sep = "\t")
        })
      }
    }) |>
    unlist()
}

sudachi <- sudachir::rebuild_tokenizer(mode = "C")
form <- \(x) {
  unlist(sudachir::form(x, type = "normalized", pos = FALSE,
                        instance = sudachi))
}
wrd <- \(conn, s1, s2) {
  purrr::map2_dbl(s1, s2, \(el1, el2) {
    el1 <- form(el1)
    el2 <- form(el2)
    tryCatch(
      apportita::calc_wrd(conn, el1, el2),
      error = \(e) {
        NA
      }
    )
  }, .progress = TRUE)
}

conn <- magnitude("models/magnitude/chive-1.2-mc90.magnitude")
dim(conn)

### tanka ----
dat <-
  readxl::read_excel(
    "data/tanka/1001-1500_poems.xlsx",
    col_names = c("id", "poems", "names", "loves", "likes", "keys1", "keys2")
  ) |>
  # dplyr::slice_sample(n = 20) |>
  dplyr::transmute(
    id = id,
    names = stringr::str_remove_all(names, "[(<U\\+[A-Z]>)]") |>
      audubon::strj_normalize() |>
      stringr::str_remove_all("[^[:alnum:]]+"),
    body = stringr::str_remove_all(poems, "[(<U\\+[A-Z]>)]") |>
      audubon::strj_normalize() |>
      stringr::str_remove_all("[^[:alnum:]]+")
  ) |>
  dplyr::filter(
    !stringi::stri_isempty(body)
  )
dat <- dat |>
  dplyr::reframe(
    id = id,
    phrase = split_kugire(body),
    .by = id
  ) |>
  tidyr::separate_wider_delim(
    phrase, delim = "\t", names = c("s1", "s2")
  )
arrow::write_parquet(dat, "tanka-kugire.parquet")
print("区切り処理終わり") # "phrase splitting done"
dat <- dat |>
  tidyr::drop_na() |>
  dplyr::mutate(
    wrd = wrd(conn, s1, s2)
  )
arrow::write_parquet(dat, "tanka-wrd.parquet")
close(conn)
rm(conn)
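## Collects Qiita articles tagged "r" through the qiitr API client, flattens
## the JSON into a tibble, keeps Japanese-language posts, and writes a
## parquet file. Sleeps regularly to respect the API rate limit.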
clean_url <- \(text) {
  pat <- "(https?|ftp)://([[a-zA-Z0-9]-]+\\.)+[[a-zA-Z0-9]-]+(/[[a-zA-Z0-9]- ./?%&=~]*)?"
  stringr::str_replace_all(text, pattern = pat, replacement = "URL")
}

to_df <- \(jsonlist) {
  purrr::map(jsonlist, \(elem) {
    text <- elem$rendered_body |>
      rvest::read_html() |>
      rvest::html_elements("p") |>
      rvest::html_text() |>
      paste(collapse = "\n")
    tags <- elem$tags |>
      purrr::map_chr(~ .x$name) |>
      paste(collapse = ",")
    tibble::tibble(
      "doc_id" = elem$id,
      "created" = elem$created_at,
      "updated" = elem$updated_at,
      "author" = elem$user$permanent_id,
      "title" = audubon::strj_normalize(elem$title),
      "text" = audubon::strj_normalize(text),
      "tags" = tags,
      "comments" = elem$comments_count,
      "likes" = elem$likes_count,
      "reactions" = elem$reactions_count,
      "stocks" = elem$stocks_count)
  }) |>
    purrr::list_rbind() |>
    dplyr::filter(
      cld3::detect_language(text) == "ja"
    ) |>
    dplyr::mutate(
      created = lubridate::as_date(created),
      updated = lubridate::as_date(updated)
    )
}

PAGES <- ceiling(4642 / 100)
df <- seq_len(PAGES) |>
  purrr::map(\(i) {
    li <- qiitr::qiita_get_items(
      tag_id = "r",
      per_page = 100, # range: 1-100
      page_offset = (i - 1), # 1 + page_offset < 100
      page_limit = 1
    )
    if (i %% 16 == 0) {
      rlang::inform(sprintf("Sleeping 16 minutes; just fetched page %d.", i))
      Sys.sleep(60 * 16)
    }
    return(to_df(li))
  }, .progress = TRUE) |>
  purrr::list_rbind()
arrow::write_parquet(df, here::here("data/qiita.parquet"))
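## Analysis of free-text survey answers (apparently an Okinawa tourism
## survey): correspondence analysis of terms by age/sex group with quanteda,
## followed by a biterm topic model (BTM).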
tbl <- readxl::read_xlsx("Downloads/koe.xlsx") |>
  tibble::rowid_to_column(var = "doc_id") |>
  dplyr::mutate(across(where(is.character), ~ dplyr::na_if(., "NA"))) |>
  tidyr::drop_na()

tbl |>
  dplyr::select(Region, Sex, Age, Satis) |>
  DataExplorer::plot_bar()

tbl <- tbl |>
  dplyr::mutate(
    doc_id = factor(doc_id),
    Region = factor(Region),
    Sex = factor(Sex),
    Age = stringi::stri_trans_nfkc(Age),
    Age = dplyr::case_match(
      Age,
      c("10代", "20代") ~ "20",
      "30代" ~ "30",
      "40代" ~ "40",
      "50代" ~ "50",
      c("60代", "70代") ~ "60"
    ),
    Age = factor(Age),
    Satis = stringr::str_detect(Satis, "満足"),
    Opinion = audubon::strj_normalize(Opinion) |>
      stringr::str_replace_all("[[:number:]]+", "N")
  )
summary(tbl)

df <- tbl |>
  dplyr::select(!Region) |>
  gibasa::tokenize(Opinion) |>
  gibasa::prettify(col_select = c("POS1", "POS2", "Original")) |>
  dplyr::mutate(token = dplyr::if_else(is.na(Original), token, Original)) |>
  gibasa::mute_tokens(
    (!POS1 %in% c("名詞", "形容詞")) |
      (token %in% c(stopwords::stopwords("ja", "marimo"), "沖縄", "観光", "旅行"))
  ) |>
  gibasa::pack()

corp <- df |>
  dplyr::right_join(
    tbl |>
      dplyr::mutate(Attr = paste(Age, Sex, sep = "_"), .keep = "unused"),
    by = "doc_id"
  ) |>
  quanteda::corpus()

require(ca)
cam <- corp |>
  quanteda::tokens(what = "fastestword") |>
  quanteda::dfm() |>
  quanteda::dfm_group(groups = quanteda::docvars(corp, "Attr")) |>
  quanteda::dfm_trim(min_termfreq = 10, max_termfreq = 50) |>
  quanteda.textmodels::textmodel_ca()
plot(cam)

require(ggplot2)
dplyr::bind_rows(
  as.data.frame(cam$rowcoord),
  as.data.frame(cam$colcoord)
) |>
  tibble::rownames_to_column() |>
  dplyr::mutate(shape = stringr::str_detect(rowname, "_")) |>
  ggplot(aes(Dim1, Dim2, shape = shape, colour = shape)) +
  geom_vline(xintercept = 0, colour = "grey", linetype = "dashed") +
  geom_hline(yintercept = 0, colour = "grey", linetype = "dashed") +
  geom_point() +
  ggrepel::geom_text_repel(aes(label = rowname), max.overlaps = 20) +
  theme_light() +
  theme(legend.position = "none")

df <- tbl |>
  dplyr::select(!Region) |>
  gibasa::tokenize(Opinion) |>
  gibasa::prettify(col_select = c("POS1", "POS2", "Original")) |>
  dplyr::mutate(token = dplyr::if_else(is.na(Original), token, Original)) |>
  gibasa::mute_tokens(
    (!POS1 %in% c("名詞", "形容詞")) |
      (token %in% c(stopwords::stopwords("ja", "marimo"), "沖縄", "観光", "旅行"))
  ) |>
  dtplyr::lazy_dt() |>
  dplyr::group_by(doc_id) |>
  dplyr::mutate(term1 = token, term2 = dplyr::lead(term1)) |>
  dplyr::add_count(term1, term2, name = "cooc") |>
  dplyr::ungroup() |>
  dplyr::select(doc_id, token, term1, term2, cooc)

biterms <- df |>
  dplyr::select(doc_id, term1, term2, cooc) |>
  tidyr::drop_na() |>
  dplyr::as_tibble()

model <- df |>
  dplyr::select(doc_id, token) |>
  dplyr::as_tibble() |>
  BTM::BTM(k = 5, background = TRUE, biterms = biterms)
plot(model, top_n = 20)
| # R Benchmark 2.5 (06/2008) [Simon Urbanek] | |
| # version 2.5: scaled to get roughly 1s per test, R 2.7.0 @ 2.6GHz Mac Pro | |
| # R Benchmark 2.4 (06/2008) [Simon Urbanek] | |
| # version 2.4 adapted to more recent Matrix package | |
| # R Benchmark 2.3 (21 April 2004) | |
| # Warning: changes are not carefully checked yet! | |
| # version 2.3 adapted to R 1.9.0 | |
| # Many thanks to Douglas Bates ([email protected]) for improvements! | |
| # version 2.2 adapted to R 1.8.0 | |
| # version 2.1 adapted to R 1.7.0 | |
| # version 2, scaled to get 1 +/- 0.1 sec with R 1.6.2 | |
| # using the standard ATLAS library (Rblas.dll) | |
| # on a Pentium IV 1.6 Ghz with 1 Gb Ram on Win XP pro | |
| # revised and optimized for R v. 1.5.x, 8 June 2002 | |
| # Requires additionnal libraries: Matrix, SuppDists | |
| # Author : Philippe Grosjean | |
| # eMail : [email protected] | |
| # Web : http://www.sciviews.org | |
| # License: GPL 2 or above at your convenience (see: http://www.gnu.org) | |
| # | |
| # Several tests are adapted from the Splus Benchmark Test V. 2 | |
| # by Stephan Steinhaus ([email protected]) | |
| # Reference for Escoufier's equivalents vectors (test III.5): | |
| # Escoufier Y., 1970. Echantillonnage dans une population de variables | |
| # aleatoires r?elles. Publ. Inst. Statis. Univ. Paris 19 Fasc 4, 1-47. | |
| # | |
| # type source("c:/<dir>/R2.R") to start the test | |
| runs <- 3 # Number of times the tests are executed | |
| times <- rep(0, 15); dim(times) <- c(5,3) | |
| require(Matrix) # Optimized matrix operations | |
| #require(SuppDists) # Optimized random number generators | |
| #Runif <- rMWC1019 # The fast uniform number generator | |
| Runif <- runif | |
| # If you don't have SuppDists, you can use: Runif <- runif | |
| #a <- rMWC1019(10, new.start=TRUE, seed=492166) # Init. the generator | |
| #Rnorm <- rziggurat # The fast normal number generator | |
| # If you don't have SuppDists, you can use: Rnorm <- rnorm | |
| #b <- rziggurat(10, new.start=TRUE) # Init. the generator | |
| Rnorm <- rnorm | |
| #remove("a", "b") | |
| options(object.size=100000000) | |
| cat("\n\n R Benchmark 2.5\n") | |
| cat(" ===============\n") | |
| cat(c("Number of times each test is run__________________________: ", runs)) | |
| cat("\n\n") | |
| cat(" I. Matrix calculation\n") | |
| cat(" ---------------------\n") | |
| if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console() | |
| # (1) | |
| cumulate <- 0; a <- 0; b <- 0 | |
| for (i in 1:runs) { | |
| invisible(gc()) | |
| timing <- system.time({ | |
| a <- matrix(Rnorm(2500*2500)/10, ncol=2500, nrow=2500); | |
| b <- t(a); | |
| dim(b) <- c(1250, 5000); | |
| a <- t(b) | |
| })[3] | |
| cumulate <- cumulate + timing | |
| } | |
| timing <- cumulate/runs | |
| times[1, 1] <- timing | |
| cat(c("Creation, transp., deformation of a 2500x2500 matrix (sec): ", timing, "\n")) | |
| remove("a", "b") | |
| if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console() | |
| # (2) | |
| cumulate <- 0; b <- 0 | |
| for (i in 1:runs) { | |
| a <- abs(matrix(Rnorm(2500*2500)/2, ncol=2500, nrow=2500)); | |
| invisible(gc()) | |
| timing <- system.time({ | |
| b <- a^1000 | |
| })[3] | |
| cumulate <- cumulate + timing | |
| } | |
| timing <- cumulate/runs | |
| times[2, 1] <- timing | |
| cat(c("2400x2400 normal distributed random matrix ^1000____ (sec): ", timing, "\n")) | |
| remove("a", "b") | |
| if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console() | |
| # (3) | |
| cumulate <- 0; b <- 0 | |
| for (i in 1:runs) { | |
| a <- Rnorm(7000000) | |
| invisible(gc()) | |
| timing <- system.time({ | |
| b <- sort(a, method="quick") # Sort is modified in v. 1.5.x | |
| # And there is now a quick method that better competes with other packages!!! | |
| })[3] | |
| cumulate <- cumulate + timing | |
| } | |
| timing <- cumulate/runs | |
| times[3, 1] <- timing | |
| cat(c("Sorting of 7,000,000 random values__________________ (sec): ", timing, "\n")) | |
| remove("a", "b") | |
| if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console() | |
| # (4) | |
| cumulate <- 0; b <- 0 | |
| for (i in 1:runs) { | |
| a <- Rnorm(2800*2800); dim(a) <- c(2800, 2800) | |
| invisible(gc()) | |
| timing <- system.time({ | |
| b <- crossprod(a) # equivalent to: b <- t(a) %*% a | |
| })[3] | |
| cumulate <- cumulate + timing | |
| } | |
| timing <- cumulate/runs | |
| times[4, 1] <- timing | |
| cat(c("2800x2800 cross-product matrix (b = a' * a)_________ (sec): ", timing, "\n")) | |
| remove("a", "b") | |
| if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console() | |
| # (5) | |
| cumulate <- 0; c <- 0; qra <-0 | |
| for (i in 1:runs) { | |
| a <- new("dgeMatrix", x = Rnorm(2000*2000), Dim = as.integer(c(2000,2000))) | |
| b <- as.double(1:2000) | |
| invisible(gc()) | |
| timing <- system.time({ | |
| c <- solve(crossprod(a), crossprod(a,b)) | |
| })[3] | |
| cumulate <- cumulate + timing | |
| # This is the old method | |
| #a <- Rnorm(600*600); dim(a) <- c(600,600) | |
| #b <- 1:600 | |
| #invisible(gc()) | |
| #timing <- system.time({ | |
| # qra <- qr(a, tol = 1e-7); | |
| # c <- qr.coef(qra, b) | |
| # #Rem: a little faster than c <- lsfit(a, b, inter=F)$coefficients | |
| #})[3] | |
| #cumulate <- cumulate + timing | |
| } | |
| timing <- cumulate/runs | |
| times[5, 1] <- timing | |
| cat(c("Linear regr. over a 3000x3000 matrix (c = a \\ b')___ (sec): ", timing, "\n")) | |
| remove("a", "b", "c", "qra") | |
| if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console() | |
| times[ , 1] <- sort(times[ , 1]) | |
| cat(" --------------------------------------------\n") | |
| cat(c(" Trimmed geom. mean (2 extremes eliminated): ", exp(mean(log(times[2:4, 1]))), "\n\n")) | |
| cat(" II. Matrix functions\n") | |
| cat(" --------------------\n") | |
| if (R.Version()$os == "Win32") flush.console() | |
| # (1) | |
| cumulate <- 0; b <- 0 | |
| for (i in 1:runs) { | |
| a <- Rnorm(2400000) | |
| invisible(gc()) | |
| timing <- system.time({ | |
| b <- fft(a) | |
| })[3] | |
| cumulate <- cumulate + timing | |
| } | |
| timing <- cumulate/runs | |
| times[1, 2] <- timing | |
| cat(c("FFT over 2,400,000 random values____________________ (sec): ", timing, "\n")) | |
| remove("a", "b") | |
| if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console() | |
| # (2) | |
| cumulate <- 0; b <- 0 | |
| for (i in 1:runs) { | |
| a <- array(Rnorm(600*600), dim = c(600, 600)) | |
| # Only needed if using eigen.Matrix(): Matrix.class(a) | |
| invisible(gc()) | |
| timing <- system.time({ | |
| b <- eigen(a, symmetric=FALSE, only.values=TRUE)$Value | |
| # Rem: on my machine, it is faster than: | |
| # b <- La.eigen(a, symmetric=F, only.values=T, method="dsyevr")$Value | |
| # b <- La.eigen(a, symmetric=F, only.values=T, method="dsyev")$Value | |
| # b <- eigen.Matrix(a, vectors = F)$Value | |
| })[3] | |
| cumulate <- cumulate + timing | |
| } | |
| timing <- cumulate/runs | |
| times[2, 2] <- timing | |
| cat(c("Eigenvalues of a 640x640 random matrix______________ (sec): ", timing, "\n")) | |
| remove("a", "b") | |
| if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console() | |
| # (3) | |
| cumulate <- 0; b <- 0 | |
| for (i in 1:runs) { | |
| a <- Rnorm(2500*2500); dim(a) <- c(2500, 2500) | |
| #Matrix.class(a) | |
| invisible(gc()) | |
| timing <- system.time({ | |
| #b <- determinant(a, logarithm=F) | |
| # Rem: the following is slower on my computer! | |
| # b <- det.default(a) | |
| b <- det(a) | |
| })[3] | |
| cumulate <- cumulate + timing | |
| } | |
| timing <- cumulate/runs | |
| times[3, 2] <- timing | |
| cat(c("Determinant of a 2500x2500 random matrix____________ (sec): ", timing, "\n")) | |
| remove("a", "b") | |
| if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console() | |
| # (4) | |
| cumulate <- 0; b <- 0 | |
| for (i in 1:runs) { | |
| a <- crossprod(new("dgeMatrix", x = Rnorm(3000*3000), | |
| Dim = as.integer(c(3000, 3000)))) | |
| invisible(gc()) | |
| #a <- Rnorm(900*900); dim(a) <- c(900, 900) | |
| #a <- crossprod(a, a) | |
| timing <- system.time({ | |
| b <- chol(a) | |
| })[3] | |
| cumulate <- cumulate + timing | |
| } | |
| timing <- cumulate/runs | |
| times[4, 2] <- timing | |
| cat(c("Cholesky decomposition of a 3000x3000 matrix________ (sec): ", timing, "\n")) | |
| remove("a", "b") | |
| if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console() | |
| # (5) | |
| cumulate <- 0; b <- 0 | |
| for (i in 1:runs) { | |
| a <- new("dgeMatrix", x = Rnorm(1600*1600), Dim = as.integer(c(1600, 1600))) | |
| invisible(gc()) | |
| #a <- Rnorm(400*400); dim(a) <- c(400, 400) | |
| timing <- system.time({ | |
| # b <- qr.solve(a) | |
| # Rem: a little faster than | |
| b <- solve(a) | |
| })[3] | |
| cumulate <- cumulate + timing | |
| } | |
| timing <- cumulate/runs | |
| times[5, 2] <- timing | |
| cat(c("Inverse of a 1600x1600 random matrix________________ (sec): ", timing, "\n")) | |
| remove("a", "b") | |
| if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console() | |
| times[ , 2] <- sort(times[ , 2]) | |
| cat(" --------------------------------------------\n") | |
| cat(c(" Trimmed geom. mean (2 extremes eliminated): ", exp(mean(log(times[2:4, 2]))), "\n\n")) | |
| cat(" III. Programmation\n") | |
| cat(" ------------------\n") | |
| if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console() | |
| # (1) | |
| cumulate <- 0; a <- 0; b <- 0; phi <- 1.6180339887498949 | |
| for (i in 1:runs) { | |
| a <- floor(Runif(3500000)*1000) | |
| invisible(gc()) | |
| timing <- system.time({ | |
| b <- (phi^a - (-phi)^(-a))/sqrt(5) | |
| })[3] | |
| cumulate <- cumulate + timing | |
| } | |
| timing <- cumulate/runs | |
| times[1, 3] <- timing | |
| cat(c("3,500,000 Fibonacci numbers calculation (vector calc)(sec): ", timing, "\n")) | |
| remove("a", "b", "phi") | |
| if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console() | |
| # (2) | |
| cumulate <- 0; a <- 3000; b <- 0 | |
| for (i in 1:runs) { | |
| invisible(gc()) | |
| timing <- system.time({ | |
| b <- rep(1:a, a); dim(b) <- c(a, a); | |
| b <- 1 / (t(b) + 0:(a-1)) | |
| # Rem: this is twice as fast as the following code proposed by R programmers | |
| # a <- 1:a; b <- 1 / outer(a - 1, a, "+") | |
| })[3] | |
| cumulate <- cumulate + timing | |
| } | |
| timing <- cumulate/runs | |
| times[2, 3] <- timing | |
| cat(c("Creation of a 3000x3000 Hilbert matrix (matrix calc) (sec): ", timing, "\n")) | |
| remove("a", "b") | |
| if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console() | |
| # (3) | |
| cumulate <- 0; c <- 0 | |
| gcd2 <- function(x, y) {if (sum(y > 1.0E-4) == 0) x else {y[y == 0] <- x[y == 0]; Recall(y, x %% y)}} | |
| for (i in 1:runs) { | |
| a <- ceiling(Runif(400000)*1000) | |
| b <- ceiling(Runif(400000)*1000) | |
| invisible(gc()) | |
| timing <- system.time({ | |
| c <- gcd2(a, b) # gcd2 is a recursive function | |
| })[3] | |
| cumulate <- cumulate + timing | |
| } | |
| timing <- cumulate/runs | |
| times[3, 3] <- timing | |
| cat(c("Grand common divisors of 400,000 pairs (recursion)__ (sec): ", timing, "\n")) | |
| remove("a", "b", "c", "gcd2") | |
| if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console() | |
# (4)
cumulate <- 0; b <- 0
for (i in 1:runs) {
  b <- rep(0, 500*500); dim(b) <- c(500, 500)
  invisible(gc())
  timing <- system.time({
    # Rem: there are faster ways to do this,
    # but here we want to time loops (500*500 iterations of nested 'for' loops)!
    for (j in 1:500) {
      for (k in 1:500) {
        b[k,j] <- abs(j - k) + 1
      }
    }
  })[3]
  cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[4, 3] <- timing
cat(c("Creation of a 500x500 Toeplitz matrix (loops)_______ (sec): ", timing, "\n"))
remove("b", "j", "k")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()
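# Aside (editor's minimal sketch): the "faster ways" alluded to above include
# a vectorised construction and stats::toeplitz(); all three give the matrix
# with entries |j - k| + 1.
b1 <- matrix(0, 5, 5)
for (j in 1:5) for (k in 1:5) b1[k, j] <- abs(j - k) + 1
b2 <- abs(outer(1:5, 1:5, "-")) + 1  # same matrix, no explicit loops
b3 <- stats::toeplitz(1:5)           # base R helper for the same structure
stopifnot(all.equal(b1, b2), all.equal(b1, b3))
remove("b1", "b2", "b3", "j", "k")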
# (5)
cumulate <- 0; p <- 0; vt <- 0; vr <- 0; vrt <- 0; rvt <- 0; RV <- 0; j <- 0; k <- 0
x2 <- 0; R <- 0; Rxx <- 0; Ryy <- 0; Rxy <- 0; Ryx <- 0; Rvmax <- 0
# Calculate the trace of a matrix (sum of its diagonal elements)
Trace <- function(y) {sum(c(y)[1 + 0:(min(dim(y)) - 1) * (dim(y)[1] + 1)], na.rm=FALSE)}
for (i in 1:runs) {
  x <- abs(Rnorm(45*45)); dim(x) <- c(45, 45)
  invisible(gc())
  timing <- system.time({
    # Calculation of Escoufier's equivalent vectors
    p <- ncol(x)
    vt <- 1:p                    # Variables to test
    vr <- NULL                   # Result: ordered variables
    RV <- 1:p                    # Result: correlations
    vrt <- NULL
    for (j in 1:p) {             # loop over the number of selected variables
      Rvmax <- 0
      for (k in 1:(p-j+1)) {     # loop over the remaining candidate variables
        x2 <- cbind(x, x[,vr], x[,vt[k]])
        R <- cor(x2)             # correlation matrix
        Ryy <- R[1:p, 1:p]
        Rxx <- R[(p+1):(p+j), (p+1):(p+j)]
        Rxy <- R[(p+1):(p+j), 1:p]
        Ryx <- t(Rxy)
        rvt <- Trace(Ryx %*% Rxy) / sqrt(Trace(Ryy %*% Ryy) * Trace(Rxx %*% Rxx))  # RV coefficient
        if (rvt > Rvmax) {
          Rvmax <- rvt           # best RV so far
          vrt <- vt[k]           # candidate held temporarily
        }
      }
      vr[j] <- vrt               # Result: selected variable
      RV[j] <- Rvmax             # Result: correlation
      vt <- vt[vt != vr[j]]      # drop it from the variables still to test
    }
  })[3]
  cumulate <- cumulate + timing
}
timing <- cumulate/runs
times[5, 3] <- timing
cat(c("Escoufier's method on a 45x45 matrix (mixed)________ (sec): ", timing, "\n"))
remove("x", "p", "vt", "vr", "vrt", "rvt", "RV", "j", "k")
remove("x2", "R", "Rxx", "Ryy", "Rxy", "Ryx", "Rvmax", "Trace")
if (R.Version()$os == "Win32" || R.Version()$os == "mingw32") flush.console()
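# Aside (editor's minimal sketch): Trace() above indexes the diagonal of the
# flattened matrix directly and is equivalent to sum(diag(y)) for a square y.
Trace <- function(y) {sum(c(y)[1 + 0:(min(dim(y)) - 1) * (dim(y)[1] + 1)], na.rm=FALSE)}
m <- matrix(1:9, 3, 3)
stopifnot(all.equal(Trace(m), sum(diag(m))))  # both are 1 + 5 + 9 = 15
remove("Trace", "m")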
times[ , 3] <- sort(times[ , 3])
cat("                      --------------------------------------------\n")
cat(c("                Trimmed geom. mean (2 extremes eliminated): ", exp(mean(log(times[2:4, 3]))), "\n\n\n"))

cat(c("Total time for all 15 tests_________________________ (sec): ", sum(times), "\n"))
cat(c("Overall mean (geom. mean of I, II and III trimmed means) (sec): ", exp(mean(log(times[2:4, ]))), "\n"))
remove("cumulate", "timing", "times", "runs", "i")
cat("         --- End of test ---\n\n")
suppressPackageStartupMessages({
  require(tidymodels)
})
ames_split <- initial_split(modeldata::ames, strata = Sale_Price)
ames_train <- training(ames_split)
ames_test <- testing(ames_split)

ames_rec <- recipe(Sale_Price ~ ., data = ames_train) |>
  step_mutate(Sale_Price = log1p(Sale_Price)) |>
  step_nzv(all_predictors()) |>
  step_select(
    all_outcomes(),
    all_numeric_predictors(),
    !starts_with("Year"), !ends_with("tude")
  ) |>
  step_normalize(all_numeric_predictors())

# injection: inline values into model arguments with !!
# https://parsnip.tidymodels.org/reference/model_spec.html
num_trees <- 1200L
ames_spec <- rand_forest(mtry = tune(), trees = !!num_trees) |>
  set_mode("regression") |>
  set_engine("ranger") |>
  set_args(num.threads = !!parallelly::availableCores(omit = 1), seed = 1234)
str(ames_spec)
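# Aside (editor's minimal sketch): !! inlines the value when the spec is
# built, while a bare name is quoted and looked up later. Inspecting the
# stored quosures shows the difference:
rlang::quo_get_expr(rand_forest(trees = num_trees)$args$trees)    # the symbol num_trees
rlang::quo_get_expr(rand_forest(trees = !!num_trees)$args$trees)  # the value 1200L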
base_wf <-
  workflow() |>
  add_recipe(ames_rec) |>
  add_model(
    rand_forest(trees = !!num_trees) |>
      set_engine("ranger") |>
      set_mode("regression") |>
      set_args(num.threads = !!parallelly::availableCores(omit = 1)),
    formula = expm1(Sale_Price) ~ .
  ) |>
  fit(ames_train)
augment(
  extract_fit_parsnip(base_wf),
  new_data = prep(ames_rec) |> bake(new_data = ames_test)
) |>
  dplyr::mutate(Sale_Price = expm1(Sale_Price)) |>  # Sale_Price was log1p-transformed in the recipe, so back-transform it
  metrics(truth = Sale_Price, estimate = .pred)
ames_grid <- workflow() |>
  add_recipe(ames_rec) |>
  add_model(ames_spec, formula = Sale_Price ~ .) |>
  tune_grid(
    resamples = vfold_cv(ames_train, v = 3),
    grid = grid_latin_hypercube(
      extract_parameter_set_dials(ames_spec) |>
        finalize(prep(ames_rec) |> bake(new_data = NULL)),
      size = 10
    ),
    control = control_grid(verbose = FALSE),
    metrics = metric_set(rmse)
  )
select_best(ames_grid)
show_best(ames_grid)

best_wf <-
  workflow() |>
  add_recipe(ames_rec) |>
  add_model(ames_spec, formula = expm1(Sale_Price) ~ .) |>
  finalize_workflow(select_best(ames_grid))
best_fit <- last_fit(best_wf, ames_split)
collect_predictions(best_fit)
collect_predictions(best_fit) |>
  dplyr::mutate(Sale_Price = expm1(Sale_Price)) |>  # Sale_Price was log1p-transformed in the recipe, so back-transform it
  metrics(truth = Sale_Price, estimate = .pred)

# model formula
# https://github.com/tidymodels/parsnip/blob/8f13c1c41ce603261f25af64694c253f618fa999/R/model_formula.R
linear_reg(penalty = 1.0) |>
  set_engine("glmnet") |>
  set_mode("regression")
### baritsu
suppressPackageStartupMessages({
  require(tidymodels)
  require(baritsu)
})

#### classification
data("penguins", package = "modeldata")
data_split <- initial_split(penguins, strata = species, prop = .7)
penguins_train <- training(data_split)
rec <-
  recipe(
    species ~ .,
    data = penguins_train
  ) |>
  step_impute_mode(all_nominal()) |>
  step_impute_median(all_numeric_predictors()) |>
  step_zv(all_numeric_predictors()) |>
  step_scale(all_numeric_predictors()) |>
  step_dummy(all_nominal_predictors())
spec <- svm_linear(
  margin = tune()
) |>
  set_engine("baritsu", penalty = tune()) |>
  set_mode("classification")
wf <- workflow() |>
  add_recipe(rec) |>
  add_model(spec)
wf_fit <- wf |>
  tune_grid(
    resamples = vfold_cv(penguins_train, v = 5, strata = species),
    grid = grid_max_entropy(
      svm_margin(),
      penalty(range = c(-1, 0)),
      size = 5
    ),
    metrics = metric_set(f_meas),
    control = control_grid(verbose = FALSE)
  )
show_best(wf_fit)
best_wf_fit <- wf |>
  finalize_workflow(select_best(wf_fit)) |>
  last_fit(data_split, metrics = metric_set(f_meas))
collect_metrics(best_wf_fit)
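# Aside (editor's minimal sketch): the finalized workflow can be pulled out
# of the last_fit() result and used for prediction on held-out rows.
extract_workflow(best_wf_fit) |>
  predict(new_data = testing(data_split)) |>
  head()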
#### regression
pkgload::load_all(export_all = FALSE)
ames <- modeldata::ames |>
  dplyr::slice_sample(n = 100) |>
  dplyr::select(Sale_Price, Lot_Area, Total_Bsmt_SF, First_Flr_SF)
data_split <- rsample::initial_split(ames)
ames_train <- rsample::training(data_split)
ames_test <- rsample::testing(data_split)
rec <-
  recipes::recipe(
    Sale_Price ~ .,
    data = ames_train
  ) |>
  recipes::step_scale(recipes::all_numeric_predictors())
spec <- lars(
  penalty_L1 = tune::tune(),
  penalty_L2 = tune::tune()
) |>
  parsnip::set_engine("baritsu") |>
  parsnip::set_mode("regression")
wf <-
  workflows::workflow() |>
  workflows::add_recipe(rec) |>
  workflows::add_model(spec)
wf_fit <- wf |>
  tune::tune_grid(
    resamples = rsample::vfold_cv(ames_train, v = 5),
    grid = dials::grid_max_entropy(
      dials::penalty_L1(range = c(-10, 0)),
      dials::penalty_L2(range = c(-10, 0)),
      size = 10
    ),
    metrics = yardstick::metric_set(yardstick::rmse),
    control = tune::control_grid(verbose = TRUE)
  )

penguins <- modeldata::penguins
data_split <- rsample::initial_split(penguins, strata = "species")
penguins_train <- rsample::training(data_split)
penguins_test <- rsample::testing(data_split)
rec <-
  recipes::recipe(
    species ~ .,
    data = penguins_train
  ) |>
  # recipes::step_select(recipes::all_outcomes(), recipes::all_numeric_predictors()) |>
  recipes::step_impute_mode(recipes::all_nominal()) |>
  recipes::step_impute_median(recipes::all_numeric_predictors()) |>
  recipes::step_scale(recipes::all_numeric_predictors())
my_dat <- recipes::prep(rec) |> recipes::bake(new_data = NULL)
my_dat2 <- recipes::prep(rec) |> recipes::bake(new_data = penguins_test)
wf_fit <- wf |>
  tune::tune_grid(
    resamples = rsample::vfold_cv(penguins_train, v = 3),
    grid = dials::grid_max_entropy(
      dials::penalty(),
      size = 5
    ),
    metrics = yardstick::metric_set(yardstick::f_meas),
    control = tune::control_grid(verbose = TRUE)
  )
require(audio.whisper)
model <- whisper("models/ggml-base.bin")
trans <- predict(model, "output.wav", language = "ja")
arrow::write_parquet(trans$tokens, "output.wav.parquet")
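# Aside (editor's minimal sketch): the saved tokens can be read back and
# collapsed to plain text; the `token` column name is assumed from
# audio.whisper's token output and may differ by version.
arrow::read_parquet("output.wav.parquet") |>
  dplyr::pull(token) |>
  paste(collapse = "")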