TPPS I/O scheduler, modified to run on the Linux 4.x kernel (tested on 4.0-rc7 under QEMU).
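Usage notes (a sketch, assuming the file is dropped into block/ of a 4.x tree and wired into the block layer's Kconfig/Makefile like the other single-queue elevators): once built in or loaded, the scheduler registers under the name "tpps" and can be selected per device with `echo tpps > /sys/block/<dev>/queue/scheduler`. Per-cgroup weights and statistics are then exposed through the blkio cgroup files registered near the bottom of the file (tpps.weight, tpps.weight_device, tpps.leaf_weight, and the tpps.* stat files).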
/*
* TPPS, or Tiny Parallel Proportion disk Scheduler.
*
* Based on ideas from Zhu Yanhai <[email protected]>
*
* Copyright (C) 2013 Robin Dong <[email protected]>
*/
#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/jiffies.h>
#include <linux/rbtree.h>
#include <linux/ioprio.h>
#include <linux/blktrace_api.h>
#include "blk-cgroup.h"
#include "blk.h"
static struct kmem_cache *tpps_pool;
struct tpps_queue {
/* reference count */
int ref;
/* parent tpps_data */
struct tpps_data *tppd;
/* tpps_group member */
struct list_head tppg_node;
/* sorted list of pending requests */
struct list_head sort_list;
struct tpps_group *tppg;
pid_t pid;
int online;
int rq_queued;
};
struct tppg_stats {
/* total bytes transferred */
struct blkg_rwstat service_bytes;
/* total IOs serviced, post merge */
struct blkg_rwstat serviced;
/* number of ios merged */
struct blkg_rwstat merged;
/* total time spent on device in ns, may not be accurate w/ queueing */
struct blkg_rwstat service_time;
/* total time spent waiting in scheduler queue in ns */
struct blkg_rwstat wait_time;
/* number of IOs queued up */
struct blkg_rwstat queued;
/* total sectors transferred */
struct blkg_stat sectors;
/* total disk time and nr sectors dispatched by this group */
struct blkg_stat time;
};
struct tpps_group {
struct blkg_policy_data pd;
/* tpps_data member */
struct list_head tppd_node;
struct list_head *cur_dispatcher;
unsigned int weight;
unsigned int new_weight;
unsigned int dev_weight;
unsigned int leaf_weight;
unsigned int new_leaf_weight;
unsigned int dev_leaf_weight;
bool needs_update;
/*
* lists of queues with requests.
*/
struct list_head queue_list;
int nr_tppq;
int rq_queued;
int rq_in_driver;
struct tppg_stats stats; /* stats for this tppg */
struct tppg_stats dead_stats; /* stats pushed from dead children */
};
struct tpps_io_cq {
struct io_cq icq; /* must be the first member */
struct tpps_queue *tppq;
uint64_t blkcg_serial_nr; /* the current blkcg ID */
};
struct tpps_data {
struct request_queue *queue;
struct tpps_group *root_group;
/* List of tpps groups being managed on this device*/
struct list_head group_list;
unsigned int busy_queues;
int dispatched;
int rq_in_driver;
struct work_struct unplug_work;
/* Number of groups which are on blkcg->blkg_list */
unsigned int nr_blkcg_linked_grps;
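/* Sum of the blkcg weights of all groups that currently have busy queues */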
unsigned int total_weight;
};
static inline struct blkcg_gq *tppg_to_blkg(struct tpps_group *tppg)
{
return pd_to_blkg(&tppg->pd);
}
#define tpps_log_tppq(tppd, tppq, fmt, args...) do { \
char __pbuf[128]; \
\
blkg_path(tppg_to_blkg((tppq)->tppg), __pbuf, sizeof(__pbuf)); \
blk_add_trace_msg((tppd)->queue, "tpps%d %s " fmt, (tppq)->pid, \
__pbuf, ##args); \
} while (0)
#define tpps_log_tppg(tppd, tppg, fmt, args...) do { \
char __pbuf[128]; \
\
blkg_path(tppg_to_blkg(tppg), __pbuf, sizeof(__pbuf)); \
blk_add_trace_msg((tppd)->queue, "%s " fmt, __pbuf, ##args); \
} while (0)
#define tpps_log(tppd, fmt, args...) \
blk_add_trace_msg((tppd)->queue, "tpps " fmt, ##args)
static inline struct tpps_io_cq *icq_to_tic(struct io_cq *icq)
{
/* tic->icq is the first member, %NULL will convert to %NULL */
return container_of(icq, struct tpps_io_cq, icq);
}
#define RQ_TIC(rq) icq_to_tic((rq)->elv.icq)
#define RQ_TPPQ(rq) ((struct tpps_queue *) ((rq)->elv.priv[0]))
#define RQ_TPPG(rq) ((struct tpps_group *) ((rq)->elv.priv[1]))
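/*
 * TPPS_WEIGHT_DEFAULT mirrors CFQ's default cgroup weight; MIN_DISPATCH_RQ
 * is the smallest device-wide quota for which a non-forced dispatch pass is
 * worth running (see tpps_dispatch_requests()).
 */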
#define TPPS_WEIGHT_DEFAULT (500)
#define MIN_DISPATCH_RQ (8)
static struct blkcg_policy blkcg_policy_tpps;
static inline struct tpps_group *pd_to_tppg(struct blkg_policy_data *pd)
{
return pd ? container_of(pd, struct tpps_group, pd) : NULL;
}
static inline struct tpps_group *blkg_to_tppg(struct blkcg_gq *blkg)
{
return pd_to_tppg(blkg_to_pd(blkg, &blkcg_policy_tpps));
}
static inline struct tpps_io_cq *
tpps_tic_lookup(struct tpps_data *tppd, struct io_context *ioc)
{
if (ioc)
return icq_to_tic(ioc_lookup_icq(ioc, tppd->queue));
return NULL;
}
static inline struct tpps_queue *tic_to_tppq(struct tpps_io_cq *tic)
{
return tic->tppq;
}
static inline void tic_set_tppq(struct tpps_io_cq *tic, struct tpps_queue *tppq)
{
tic->tppq = tppq;
}
static inline struct tpps_data *tic_to_tppd(struct tpps_io_cq *tic)
{
return tic->icq.q->elevator->elevator_data;
}
static inline void tppg_get(struct tpps_group *tppg)
{
return blkg_get(tppg_to_blkg(tppg));
}
static inline void tppg_put(struct tpps_group *tppg)
{
return blkg_put(tppg_to_blkg(tppg));
}
static inline void tppg_stats_update_io_add(struct tpps_group *tppg,
struct tpps_group *curr_tppg, int rw)
{
blkg_rwstat_add(&tppg->stats.queued, rw, 1);
}
static inline void tppg_stats_update_io_remove(struct tpps_group *tppg, int rw)
{
blkg_rwstat_add(&tppg->stats.queued, rw, -1);
}
static inline void tppg_stats_update_io_merged(struct tpps_group *tppg, int rw)
{
blkg_rwstat_add(&tppg->stats.merged, rw, 1);
}
static inline void tppg_stats_update_dispatch(struct tpps_group *tppg,
uint64_t bytes, int rw)
{
blkg_stat_add(&tppg->stats.sectors, bytes >> 9);
blkg_rwstat_add(&tppg->stats.serviced, rw, 1);
blkg_rwstat_add(&tppg->stats.service_bytes, rw, bytes);
}
static inline void tppg_stats_update_completion(struct tpps_group *tppg,
uint64_t start_time, uint64_t io_start_time, int rw)
{
struct tppg_stats *stats = &tppg->stats;
unsigned long long now = sched_clock();
if (time_after64(now, io_start_time))
blkg_rwstat_add(&stats->service_time, rw, now - io_start_time);
if (time_after64(io_start_time, start_time))
blkg_rwstat_add(&stats->wait_time, rw,
io_start_time - start_time);
}
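/*
 * Unlink a queue from its group. When a group loses its last busy queue,
 * its blkcg weight is subtracted from tppd->total_weight, so dispatch
 * quotas are shared only among groups that still have pending requests.
 */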
static void tpps_del_queue(struct tpps_queue *tppq)
{
struct tpps_data *tppd = tppq->tppd;
struct tpps_group *tppg = tppq->tppg;
if (!list_empty(&tppq->tppg_node)) {
list_del_init(&tppq->tppg_node);
tpps_log_tppq(tppd, tppq, "del queue\n");
tppg->cur_dispatcher = NULL;
tppq->tppg = NULL;
}
printk("%p nr_tppq:%d\n", tppg, tppg->nr_tppq);
BUG_ON(tppg->nr_tppq < 1);
tppg->nr_tppq--;
if (!tppg->nr_tppq)
tppd->total_weight -= tppg->pd.blkg->blkcg->cfq_weight;
BUG_ON(!tppd->busy_queues);
tppd->busy_queues--;
}
/*
* task holds one reference to the queue, dropped when task exits. each rq
* in-flight on this queue also holds a reference, dropped when rq is freed.
*
* Each tpps queue took a reference on the parent group. Drop it now.
* queue lock must be held here.
*/
static void tpps_put_queue(struct tpps_queue *tppq)
{
struct tpps_data *tppd = tppq->tppd;
struct tpps_group *tppg;
BUG_ON(tppq->ref <= 0);
tppq->ref--;
if (tppq->ref)
return;
tpps_log_tppq(tppd, tppq, "put_queue");
BUG_ON(!list_empty(&tppq->sort_list));
tppg = tppq->tppg;
tpps_del_queue(tppq);
kmem_cache_free(tpps_pool, tppq);
tppg_put(tppg);
}
static void tpps_init_tppq(struct tpps_data *tppd, struct tpps_queue *tppq,
pid_t pid)
{
INIT_LIST_HEAD(&tppq->tppg_node);
INIT_LIST_HEAD(&tppq->sort_list);
tppq->ref = 0;
tppq->tppd = tppd;
tppq->pid = pid;
}
static void tpps_link_tppq_tppg(struct tpps_queue *tppq,
struct tpps_group *tppg)
{
tppq->tppg = tppg;
/* tppq reference on tppg */
tppg_get(tppg);
}
static struct tpps_group *tpps_lookup_create_tppg(struct tpps_data *tppd,
struct blkcg *blkcg)
{
struct request_queue *q = tppd->queue;
struct tpps_group *tppg = NULL;
/* avoid lookup for the common case where there's no blkcg */
if (blkcg == &blkcg_root) {
tppg = tppd->root_group;
} else {
struct blkcg_gq *blkg;
blkg = blkg_lookup_create(blkcg, q);
if (!IS_ERR(blkg))
tppg = blkg_to_tppg(blkg);
/* on lookup/creation failure fall back to the root group, never NULL */
if (!tppg)
tppg = tppd->root_group;
}
return tppg;
}
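/*
 * Find the tppq attached to this tic, or allocate a new one. For blocking
 * allocations (__GFP_WAIT) the queue lock is dropped around
 * kmem_cache_alloc_node() and the whole lookup is retried, since the blkcg
 * may have changed while the lock was released.
 */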
static struct tpps_queue *
tpps_find_alloc_queue(struct tpps_data *tppd, struct tpps_io_cq* tic, struct bio *bio,
gfp_t gfp_mask)
{
struct tpps_queue *tppq, *new_tppq = NULL;
struct tpps_group *tppg;
struct blkcg *blkcg;
retry:
rcu_read_lock();
blkcg = bio_blkcg(bio);
tppg = tpps_lookup_create_tppg(tppd, blkcg);
tppq = tic_to_tppq(tic);
if (!tppq) {
if (new_tppq) {
tppq = new_tppq;
new_tppq = NULL;
} else if (gfp_mask & __GFP_WAIT) {
rcu_read_unlock();
spin_unlock_irq(tppd->queue->queue_lock);
new_tppq = kmem_cache_alloc_node(tpps_pool,
gfp_mask | __GFP_ZERO,
tppd->queue->node);
spin_lock_irq(tppd->queue->queue_lock);
if (new_tppq)
goto retry;
} else
tppq = kmem_cache_alloc_node(tpps_pool,
gfp_mask | __GFP_ZERO,
tppd->queue->node);
if (tppq) {
tpps_init_tppq(tppd, tppq, current->pid);
tpps_link_tppq_tppg(tppq, tppg);
tpps_log_tppq(tppd, tppq, "alloced");
}
}
if (new_tppq)
kmem_cache_free(tpps_pool, new_tppq);
rcu_read_unlock();
return tppq;
}
static struct tpps_queue *
tpps_get_queue(struct tpps_data *tppd, struct tpps_io_cq *tic, struct bio *bio,
gfp_t gfp_mask)
{
struct tpps_queue *tppq;
tppq = tpps_find_alloc_queue(tppd, tic, bio, gfp_mask);
if (tppq)
tppq->ref++;
return tppq;
}
/*
* scheduler run of queue, if there are requests pending and no one in the
* driver that will restart queueing
*/
static inline void tpps_schedule_dispatch(struct tpps_data *tppd)
{
if (tppd->busy_queues) {
tpps_log(tppd, "schedule dispatch");
kblockd_schedule_work(&tppd->unplug_work);
}
}
static void check_blkcg_changed(struct tpps_io_cq *tic, struct bio *bio)
{
struct tpps_data *tppd = tic_to_tppd(tic);
struct tpps_queue *tppq;
uint64_t serial_nr;
rcu_read_lock();
serial_nr = bio_blkcg(bio)->css.serial_nr;
rcu_read_unlock();
/*
* Check whether blkcg has changed. The condition may trigger
* spuriously on a newly created tic but there's no harm.
*/
if (unlikely(!tppd) || likely(tic->blkcg_serial_nr == serial_nr))
return;
tppq = tic_to_tppq(tic);
if (tppq) {
/*
* Drop reference to sync queue. A new sync queue will be
* assigned in new group upon arrival of a fresh request.
*/
tpps_log_tppq(tppd, tppq, "changed cgroup");
tic_set_tppq(tic, NULL);
tpps_put_queue(tppq);
}
tic->blkcg_serial_nr = serial_nr;
}
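/*
 * elevator_set_req_fn: called once per request. Pins references to the
 * tpps_queue and its group in rq->elv.priv[] for the request's lifetime;
 * tpps_put_request() drops them again.
 */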
static int
tpps_set_request(struct request_queue *q, struct request *rq, struct bio *bio,
gfp_t gfp_mask)
{
struct tpps_data *tppd = q->elevator->elevator_data;
struct tpps_io_cq *tic = icq_to_tic(rq->elv.icq);
struct tpps_queue *tppq;
might_sleep_if(gfp_mask & __GFP_WAIT);
spin_lock_irq(q->queue_lock);
check_blkcg_changed(tic, bio);
tppq = tic_to_tppq(tic);
if (!tppq) {
tppq = tpps_get_queue(tppd, tic, bio, gfp_mask);
if (!tppq) { /* allocation failed; report failure to the block layer */
spin_unlock_irq(q->queue_lock);
return 1;
}
tic_set_tppq(tic, tppq);
}
tppq->ref++;
tppg_get(tppq->tppg);
rq->elv.priv[0] = tppq;
rq->elv.priv[1] = tppq->tppg;
spin_unlock_irq(q->queue_lock);
return 0;
}
/*
* queue lock held here
*/
static void tpps_put_request(struct request *rq)
{
struct tpps_queue *tppq = RQ_TPPQ(rq);
if (tppq) {
WARN_ON(tppq->tppg != RQ_TPPG(rq));
/* Put down rq reference on tppg */
tppg_put(RQ_TPPG(rq));
rq->elv.priv[0] = NULL;
rq->elv.priv[1] = NULL;
tpps_put_queue(tppq);
}
}
static void
tpps_update_group_weight(struct tpps_group *tppg)
{
if (tppg->needs_update) {
tppg->weight = tppg->new_weight;
tppg->needs_update = false;
}
}
static void tpps_add_queue(struct tpps_data *tppd, struct tpps_queue *tppq)
{
struct tpps_group *tppg;
if (!tppq->online) {
tppq->online = 1;
tppg = tppq->tppg;
tpps_log_tppq(tppd, tppq, "add queue");
tppg->nr_tppq++;
tppd->busy_queues++;
list_add(&tppq->tppg_node, &tppg->queue_list);
printk("add tppq %p to %p\n", tppq, tppg);
tpps_update_group_weight(tppg);
if (tppg->nr_tppq <= 1) {
tppd->total_weight += tppg->pd.blkg->blkcg->cfq_weight;
list_add(&tppg->tppd_node, &tppd->group_list);
printk("twt:%u, wt:%u %u %d %p\n", tppd->total_weight, tppg->weight,
tppg->pd.blkg->blkcg->cfq_weight,
tppg->nr_tppq,
tppg);
}
}
}
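/*
 * elevator_add_req_fn: append the request to its queue's FIFO, update the
 * queued counters on the queue, group and device, and link the queue into
 * its group if this is its first pending request.
 */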
static void tpps_insert_request(struct request_queue *q, struct request *rq)
{
struct tpps_data *tppd = q->elevator->elevator_data;
struct tpps_queue *tppq = RQ_TPPQ(rq);
tpps_log_tppq(tppd, tppq, "insert_request");
list_add_tail(&rq->queuelist, &tppq->sort_list);
tppq->rq_queued++;
tppq->tppg->rq_queued++;
tppd->dispatched++;
tpps_add_queue(tppd, tppq);
tppg_stats_update_io_add(RQ_TPPG(rq), tppq->tppg, rq->cmd_flags);
}
static void tpps_remove_request(struct request *rq)
{
struct tpps_queue *tppq = RQ_TPPQ(rq);
list_del_init(&rq->queuelist);
tppq->rq_queued--;
tppq->tppg->rq_queued--;
tppg_stats_update_io_remove(RQ_TPPG(rq), rq->cmd_flags);
}
/*
* Move request from internal lists to the request queue dispatch list.
*/
static int tpps_dispatch_insert(struct request_queue *q,
struct tpps_queue *tppq)
{
struct list_head *next = tppq->sort_list.next;
struct request *rq;
if (next == &tppq->sort_list)
return 0;
rq = rq_entry_fifo(next);
tpps_remove_request(rq);
elv_dispatch_sort(q, rq);
tppg_stats_update_dispatch(tppq->tppg, blk_rq_bytes(rq), rq->cmd_flags);
return 1;
}
static int tpps_dispatch_requests_nr(struct tpps_data *tppd,
struct tpps_queue *tppq, int count)
{
int cnt = 0, ret;
if (!tppq->rq_queued)
return cnt;
do {
ret = tpps_dispatch_insert(tppd->queue, tppq);
if (ret) {
cnt++;
tppd->dispatched--;
}
} while (ret && cnt < count);
return cnt;
}
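/*
 * The heart of TPPS. The device-wide quota is the room left in the driver
 * (q->nr_requests - rq_in_driver). Each group with busy queues receives a
 * share of that quota proportional to its blkcg weight, and the queues
 * inside a group are served round-robin from cur_dispatcher, one request
 * per queue per pass, until the group's share is used up. "force" ignores
 * the quotas and drains everything. Returns nonzero if any request was
 * moved to the dispatch list.
 */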
static int tpps_dispatch_requests(struct request_queue *q, int force)
{
struct tpps_data *tppd = q->elevator->elevator_data;
struct tpps_group *tppg, *group_n;
struct tpps_queue *tppq;
struct list_head *next;
int count = 0, total = 0, ret;
int quota, grp_quota;
if (!tppd->total_weight)
return 0;
quota = q->nr_requests - tppd->rq_in_driver;
if (quota < MIN_DISPATCH_RQ && !force)
return 0;
list_for_each_entry_safe(tppg, group_n, &tppd->group_list, tppd_node) {
if (!tppg->nr_tppq)
continue;
grp_quota = (quota * tppg->pd.blkg->blkcg->cfq_weight
/ tppd->total_weight) - tppg->rq_in_driver;
tpps_log_tppg(tppd, tppg,
"nr:%d, wt:%u total_wt:%u in_driver:%d %d quota:%d grp_quota:%d",
tppg->nr_tppq, tppg->pd.blkg->blkcg->cfq_weight,
tppd->total_weight, tppg->rq_in_driver, tppg->rq_queued,
quota, grp_quota);
if (grp_quota <= 0 && !force)
continue;
BUG_ON(list_empty(&tppg->queue_list));
if (!tppg->cur_dispatcher)
tppg->cur_dispatcher = tppg->queue_list.next;
next = tppg->cur_dispatcher;
count = 0;
do {
tppq = list_entry(next, struct tpps_queue, tppg_node);
tpps_log_tppq(tppd, tppq, "tppq: %d\n", tppq->rq_queued);
if (force)
ret = tpps_dispatch_requests_nr(tppd, tppq, -1);
else
ret = tpps_dispatch_requests_nr(tppd, tppq, 1);
count += ret;
total += ret;
next = next->next;
if (next == &tppg->queue_list)
next = tppg->queue_list.next;
if (count >= grp_quota && !force) {
tppg->cur_dispatcher = next;
break;
}
BUG_ON(tppg->cur_dispatcher == &tppg->queue_list);
} while (next != tppg->cur_dispatcher);
}
return total > 0;
}
static void tpps_kick_queue(struct work_struct *work)
{
struct tpps_data *tppd =
container_of(work, struct tpps_data, unplug_work);
struct request_queue *q = tppd->queue;
spin_lock_irq(q->queue_lock);
__blk_run_queue(q);
spin_unlock_irq(q->queue_lock);
}
static void tpps_init_tppg_base(struct tpps_group *tppg)
{
INIT_LIST_HEAD(&tppg->tppd_node);
INIT_LIST_HEAD(&tppg->queue_list);
tppg->cur_dispatcher = NULL;
}
static int tpps_init_queue(struct request_queue *q, struct elevator_type *e)
{
struct tpps_data *tppd;
struct tpps_group *tppg;
int ret;
struct elevator_queue *eq;
eq = elevator_alloc(q, e);
if (!eq)
return -ENOMEM;
tppd = kzalloc_node(sizeof(*tppd), GFP_KERNEL, q->node);
if (!tppd) {
kobject_put(&eq->kobj);
return -ENOMEM;
}
eq->elevator_data = tppd;
tppd->queue = q;
spin_lock_irq(q->queue_lock);
q->elevator = eq;
spin_unlock_irq(q->queue_lock);
INIT_LIST_HEAD(&tppd->group_list);
ret = blkcg_activate_policy(q, &blkcg_policy_tpps);
if (ret)
goto out_free;
/* Init root group */
tppd->root_group = blkg_to_tppg(q->root_blkg);
tppg = tppd->root_group;
tpps_init_tppg_base(tppg);
/* Give preference to root group over other groups */
tppg->weight = 2 * TPPS_WEIGHT_DEFAULT;
tppg->leaf_weight = 2 * TPPS_WEIGHT_DEFAULT;
INIT_WORK(&tppd->unplug_work, tpps_kick_queue);
return 0;
out_free:
kfree(tppd);
kobject_put(&eq->kobj);
return ret;
}
static void tpps_exit_queue(struct elevator_queue *e)
{
struct tpps_data *tppd = e->elevator_data;
struct request_queue *q = tppd->queue;
cancel_work_sync(&tppd->unplug_work);
blkcg_deactivate_policy(q, &blkcg_policy_tpps);
/*
 * tppd->root_group points into the root blkg's policy data, which
 * blkcg_deactivate_policy() above has already freed; freeing it again
 * here would be a double free.
 */
kfree(tppd);
}
static void tpps_activate_request(struct request_queue *q, struct request *rq)
{
struct tpps_queue *tppq = RQ_TPPQ(rq);
struct tpps_data *tppd = q->elevator->elevator_data;
tppd->rq_in_driver++;
tppq->tppg->rq_in_driver++;
tpps_log_tppq(tppd, RQ_TPPQ(rq), "activate rq, drv=%d",
tppd->rq_in_driver);
}
static void tpps_deactivate_request(struct request_queue *q, struct request *rq)
{
struct tpps_queue *tppq = RQ_TPPQ(rq);
struct tpps_data *tppd = q->elevator->elevator_data;
WARN_ON(!tppd->rq_in_driver);
tppd->rq_in_driver--;
tppq->tppg->rq_in_driver--;
tpps_log_tppq(tppd, RQ_TPPQ(rq), "deactivate rq, drv=%d",
tppd->rq_in_driver);
}
static void tpps_completed_request(struct request_queue *q, struct request *rq)
{
struct tpps_queue *tppq = RQ_TPPQ(rq);
struct tpps_data *tppd = q->elevator->elevator_data;
WARN_ON(!tppq);
WARN_ON(tppq->tppg != RQ_TPPG(rq));
tpps_log_tppq(tppd, tppq, "complete rqnoidle %d",
!!(rq->cmd_flags & REQ_NOIDLE));
WARN_ON(!tppd->rq_in_driver);
tppd->rq_in_driver--;
tppq->tppg->rq_in_driver--;
tppg_stats_update_completion(tppq->tppg,
rq_start_time_ns(rq), rq_io_start_time_ns(rq), rq->cmd_flags);
if (!tppd->rq_in_driver)
tpps_schedule_dispatch(tppd);
}
static void
tpps_merged_request(struct request_queue *q, struct request *rq, int type)
{
if (type == ELEVATOR_FRONT_MERGE) {
struct tpps_queue *tppq = RQ_TPPQ(rq);
list_del_init(&rq->queuelist);
tppq->rq_queued--;
tppg_stats_update_io_remove(RQ_TPPG(rq), rq->cmd_flags);
list_add_tail(&rq->queuelist, &tppq->sort_list);
tppq->rq_queued++;
tppg_stats_update_io_add(RQ_TPPG(rq), tppq->tppg, rq->cmd_flags);
}
}
static void
tpps_merged_requests(struct request_queue *q, struct request *rq,
struct request *next)
{
tpps_remove_request(next);
tppg_stats_update_io_merged(RQ_TPPG(rq), rq->cmd_flags);
}
static void tpps_init_icq(struct io_cq *icq)
{ }
static void tpps_exit_icq(struct io_cq *icq)
{
struct tpps_io_cq *tic = icq_to_tic(icq);
if (tic->tppq) {
tpps_put_queue(tic->tppq);
tic->tppq = NULL;
}
}
static struct elevator_type iosched_tpps = {
.ops = {
.elevator_merged_fn = tpps_merged_request,
.elevator_merge_req_fn = tpps_merged_requests,
.elevator_dispatch_fn = tpps_dispatch_requests,
.elevator_add_req_fn = tpps_insert_request,
.elevator_activate_req_fn = tpps_activate_request,
.elevator_deactivate_req_fn = tpps_deactivate_request,
.elevator_completed_req_fn = tpps_completed_request,
.elevator_init_icq_fn = tpps_init_icq,
.elevator_exit_icq_fn = tpps_exit_icq,
.elevator_set_req_fn = tpps_set_request,
.elevator_put_req_fn = tpps_put_request,
.elevator_init_fn = tpps_init_queue,
.elevator_exit_fn = tpps_exit_queue,
},
.icq_size = sizeof(struct tpps_io_cq),
.icq_align = __alignof__(struct tpps_io_cq),
.elevator_name = "tpps",
.elevator_owner = THIS_MODULE,
};
static u64 tppg_prfill_weight_device(struct seq_file *sf,
struct blkg_policy_data *pd, int off)
{
struct tpps_group *tppg = pd_to_tppg(pd);
if (!tppg->dev_weight)
return 0;
return __blkg_prfill_u64(sf, pd, tppg->dev_weight);
}
static int tppg_print_weight_device(struct seq_file *sf, void *v)
{
blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
tppg_prfill_weight_device, &blkcg_policy_tpps,
0, false);
return 0;
}
static u64 tppg_prfill_leaf_weight_device(struct seq_file *sf,
struct blkg_policy_data *pd, int off)
{
struct tpps_group *tppg = pd_to_tppg(pd);
if (!tppg->dev_leaf_weight)
return 0;
return __blkg_prfill_u64(sf, pd, tppg->dev_leaf_weight);
}
static int tppg_print_leaf_weight_device(struct seq_file *sf, void *v)
{
blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
tppg_prfill_leaf_weight_device, &blkcg_policy_tpps,
0, false);
return 0;
}
static int tppg_print_weight(struct seq_file *sf, void *v)
{
seq_printf(sf, "%u\n", css_to_blkcg(seq_css(sf))->cfq_weight);
return 0;
}
static int tppg_print_leaf_weight(struct seq_file *sf, void *v)
{
seq_printf(sf, "%u\n", css_to_blkcg(seq_css(sf))->cfq_leaf_weight);
return 0;
}
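/*
 * Writing "MAJ:MIN weight" sets a per-device weight override; a weight of
 * 0 clears the override so the group falls back to the blkcg-wide weight
 * (the "?:" fallback below).
 */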
static ssize_t __tppg_set_weight_device(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off,
bool is_leaf_weight)
{
struct blkcg *blkcg = css_to_blkcg(of_css(of));
struct blkg_conf_ctx ctx;
struct tpps_group *tppg;
int ret;
ret = blkg_conf_prep(blkcg, &blkcg_policy_tpps, buf, &ctx);
if (ret)
return ret;
ret = -EINVAL;
tppg = blkg_to_tppg(ctx.blkg);
if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) {
if (!is_leaf_weight) {
tppg->dev_weight = ctx.v;
tppg->new_weight = ctx.v ?: blkcg->cfq_weight;
} else {
tppg->dev_leaf_weight = ctx.v;
tppg->new_leaf_weight = ctx.v ?: blkcg->cfq_leaf_weight;
}
ret = 0;
}
blkg_conf_finish(&ctx);
return ret;
}
static ssize_t tppg_set_weight_device(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
return __tppg_set_weight_device(of, buf, nbytes, off, false);
}
static ssize_t tppg_set_leaf_weight_device(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
return __tppg_set_weight_device(of, buf, nbytes, off, true);
}
static int __tpps_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
u64 val, bool is_leaf_weight)
{
struct blkcg *blkcg = css_to_blkcg(css);
struct blkcg_gq *blkg;
if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX)
return -EINVAL;
spin_lock_irq(&blkcg->lock);
if (!is_leaf_weight)
blkcg->cfq_weight = val;
else
blkcg->cfq_leaf_weight = val;
hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
struct tpps_group *tppg = blkg_to_tppg(blkg);
if (!tppg)
continue;
if (!is_leaf_weight) {
if (!tppg->dev_weight)
tppg->new_weight = blkcg->cfq_weight;
} else {
if (!tppg->dev_leaf_weight)
tppg->new_leaf_weight = blkcg->cfq_leaf_weight;
}
}
spin_unlock_irq(&blkcg->lock);
return 0;
}
static int tpps_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
u64 val)
{
return __tpps_set_weight(css, cft, val, false);
}
static int tpps_set_leaf_weight(struct cgroup_subsys_state *css,
struct cftype *cft, u64 val)
{
return __tpps_set_weight(css, cft, val, true);
}
/* offset delta from tppg->stats to tppg->dead_stats */
static const int dead_stats_off_delta = offsetof(struct tpps_group, dead_stats) -
offsetof(struct tpps_group, stats);
/* to be used by recursive prfill, sums live and dead rwstats recursively */
static struct blkg_rwstat tppg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd,
int off)
{
struct blkg_rwstat a, b;
a = blkg_rwstat_recursive_sum(pd, off);
b = blkg_rwstat_recursive_sum(pd, off + dead_stats_off_delta);
blkg_rwstat_merge(&a, &b);
return a;
}
/* to be used by recursive prfill, sums live and dead stats recursively */
static u64 tppg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off)
{
u64 sum = 0;
sum += blkg_stat_recursive_sum(pd, off);
sum += blkg_stat_recursive_sum(pd, off + dead_stats_off_delta);
return sum;
}
static int tppg_print_stat(struct seq_file *sf, void *v)
{
blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
&blkcg_policy_tpps, seq_cft(sf)->private, false);
return 0;
}
static int tppg_print_rwstat(struct seq_file *sf, void *v)
{
blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
&blkcg_policy_tpps, seq_cft(sf)->private, true);
return 0;
}
static u64 tppg_prfill_stat_recursive(struct seq_file *sf,
struct blkg_policy_data *pd, int off)
{
u64 sum = tppg_stat_pd_recursive_sum(pd, off);
return __blkg_prfill_u64(sf, pd, sum);
}
static u64 tppg_prfill_rwstat_recursive(struct seq_file *sf,
struct blkg_policy_data *pd, int off)
{
struct blkg_rwstat sum = tppg_rwstat_pd_recursive_sum(pd, off);
return __blkg_prfill_rwstat(sf, pd, &sum);
}
static int tppg_print_stat_recursive(struct seq_file *sf, void *v)
{
blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
tppg_prfill_stat_recursive, &blkcg_policy_tpps,
seq_cft(sf)->private, false);
return 0;
}
static int tppg_print_rwstat_recursive(struct seq_file *sf, void *v)
{
blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
tppg_prfill_rwstat_recursive, &blkcg_policy_tpps,
seq_cft(sf)->private, true);
return 0;
}
static struct cftype tpps_blkcg_files[] = {
/* on root, weight is mapped to leaf_weight */
{
.name = "tpps.weight_device",
.flags = CFTYPE_ONLY_ON_ROOT,
.seq_show = tppg_print_leaf_weight_device,
.write = tppg_set_leaf_weight_device,
.max_write_len = 256,
},
{
.name = "tpps.weight",
.flags = CFTYPE_ONLY_ON_ROOT,
.seq_show = tppg_print_leaf_weight,
.write_u64 = tpps_set_leaf_weight,
},
/* no such mapping necessary for !roots */
{
.name = "tpps.weight_device",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = tppg_print_weight_device,
.write = tppg_set_weight_device,
.max_write_len = 256,
},
{
.name = "tpps.weight",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = tppg_print_weight,
.write_u64 = tpps_set_weight,
},
{
.name = "tpps.leaf_weight_device",
.seq_show = tppg_print_leaf_weight_device,
.write = tppg_set_leaf_weight_device,
.max_write_len = 256,
},
{
.name = "tpps.leaf_weight",
.seq_show = tppg_print_leaf_weight,
.write_u64 = tpps_set_leaf_weight,
},
/* statistics, covers only the tasks in the tppg */
{
.name = "tpps.time",
.private = offsetof(struct tpps_group, stats.time),
.seq_show = tppg_print_stat,
},
{
.name = "tpps.sectors",
.private = offsetof(struct tpps_group, stats.sectors),
.seq_show = tppg_print_stat,
},
{
.name = "tpps.io_service_bytes",
.private = offsetof(struct tpps_group, stats.service_bytes),
.seq_show = tppg_print_rwstat,
},
{
.name = "tpps.io_serviced",
.private = offsetof(struct tpps_group, stats.serviced),
.seq_show = tppg_print_rwstat,
},
{
.name = "tpps.io_service_time",
.private = offsetof(struct tpps_group, stats.service_time),
.seq_show = tppg_print_rwstat,
},
{
.name = "tpps.io_wait_time",
.private = offsetof(struct tpps_group, stats.wait_time),
.seq_show = tppg_print_rwstat,
},
{
.name = "tpps.io_merged",
.private = offsetof(struct tpps_group, stats.merged),
.seq_show = tppg_print_rwstat,
},
{
.name = "tpps.io_queued",
.private = offsetof(struct tpps_group, stats.queued),
.seq_show = tppg_print_rwstat,
},
/* the same statistics which cover the tppg and its descendants */
{
.name = "tpps.time_recursive",
.private = offsetof(struct tpps_group, stats.time),
.seq_show = tppg_print_stat_recursive,
},
{
.name = "tpps.sectors_recursive",
.private = offsetof(struct tpps_group, stats.sectors),
.seq_show = tppg_print_stat_recursive,
},
{
.name = "tpps.io_service_bytes_recursive",
.private = offsetof(struct tpps_group, stats.service_bytes),
.seq_show = tppg_print_rwstat_recursive,
},
{
.name = "tpps.io_serviced_recursive",
.private = offsetof(struct tpps_group, stats.serviced),
.seq_show = tppg_print_rwstat_recursive,
},
{
.name = "tpps.io_service_time_recursive",
.private = offsetof(struct tpps_group, stats.service_time),
.seq_show = tppg_print_rwstat_recursive,
},
{
.name = "tpps.io_wait_time_recursive",
.private = offsetof(struct tpps_group, stats.wait_time),
.seq_show = tppg_print_rwstat_recursive,
},
{
.name = "tpps.io_merged_recursive",
.private = offsetof(struct tpps_group, stats.merged),
.seq_show = tppg_print_rwstat_recursive,
},
{
.name = "tpps.io_queued_recursive",
.private = offsetof(struct tpps_group, stats.queued),
.seq_show = tppg_print_rwstat_recursive,
},
{ } /* terminate */
};
static void tpps_pd_init(struct blkcg_gq *blkg)
{
struct tpps_group *tppg = blkg_to_tppg(blkg);
tpps_init_tppg_base(tppg);
tppg->weight = blkg->blkcg->cfq_weight;
tppg->leaf_weight = blkg->blkcg->cfq_leaf_weight;
}
static inline struct tpps_group *tppg_parent(struct tpps_group *tppg)
{
struct blkcg_gq *pblkg = tppg_to_blkg(tppg)->parent;
return pblkg ? blkg_to_tppg(pblkg) : NULL;
}
static void tppg_stats_reset(struct tppg_stats *stats)
{
/* queued stats shouldn't be cleared */
blkg_rwstat_reset(&stats->service_bytes);
blkg_rwstat_reset(&stats->serviced);
blkg_rwstat_reset(&stats->merged);
blkg_rwstat_reset(&stats->service_time);
blkg_rwstat_reset(&stats->wait_time);
blkg_stat_reset(&stats->time);
/*
 * struct tppg_stats carries none of CFQ's CONFIG_DEBUG_BLK_CGROUP-only
 * counters, so there is nothing debug-specific to reset here.
 */
}
/* @to += @from */
static void tppg_stats_merge(struct tppg_stats *to, struct tppg_stats *from)
{
/* queued stats shouldn't be merged */
blkg_rwstat_merge(&to->service_bytes, &from->service_bytes);
blkg_rwstat_merge(&to->serviced, &from->serviced);
blkg_rwstat_merge(&to->merged, &from->merged);
blkg_rwstat_merge(&to->service_time, &from->service_time);
blkg_rwstat_merge(&to->wait_time, &from->wait_time);
blkg_stat_merge(&from->time, &from->time);
#ifdef CONFIG_DEBUG_BLK_CGROUP
blkg_stat_merge(&to->unaccounted_time, &from->unaccounted_time);
blkg_stat_merge(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
blkg_stat_merge(&to->avg_queue_size_samples, &from->avg_queue_size_samples);
blkg_stat_merge(&to->dequeue, &from->dequeue);
blkg_stat_merge(&to->group_wait_time, &from->group_wait_time);
blkg_stat_merge(&to->idle_time, &from->idle_time);
blkg_stat_merge(&to->empty_time, &from->empty_time);
#endif
}
static void tppg_stats_xfer_dead(struct tpps_group *tppg)
{
struct tpps_group *parent = tppg_parent(tppg);
lockdep_assert_held(tppg_to_blkg(tppg)->q->queue_lock);
if (unlikely(!parent))
return;
tppg_stats_merge(&parent->dead_stats, &tppg->stats);
tppg_stats_merge(&parent->dead_stats, &tppg->dead_stats);
tppg_stats_reset(&tppg->stats);
tppg_stats_reset(&tppg->dead_stats);
}
static void tpps_pd_offline(struct blkcg_gq *blkg)
{
struct tpps_group *tppg = blkg_to_tppg(blkg);
/*
* @blkg is going offline and will be ignored by
* blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so
* that they don't get lost. If IOs complete after this point, the
* stats for them will be lost. Oh well...
*/
tppg_stats_xfer_dead(tppg);
if (!list_empty(&tppg->tppd_node))
list_del_init(&tppg->tppd_node);
/* BUG_ON(!list_empty(&tppg->queue_list)); */
}
static void tpps_pd_reset_stats(struct blkcg_gq *blkg)
{
struct tpps_group *tppg = blkg_to_tppg(blkg);
tppg_stats_reset(&tppg->stats);
tppg_stats_reset(&tppg->dead_stats);
}
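/*
 * blkcg policy hooks: pd_size tells blkcg how much per-group policy data
 * to allocate alongside each blkg; init/offline/reset operate on the
 * embedded tpps_group.
 */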
static struct blkcg_policy blkcg_policy_tpps = {
.pd_size = sizeof(struct tpps_group),
.cftypes = tpps_blkcg_files,
.pd_init_fn = tpps_pd_init,
.pd_offline_fn = tpps_pd_offline,
.pd_reset_stats_fn = tpps_pd_reset_stats,
};
/* Module init/exit modeled on the __init code in cfq-iosched.c. */
static int __init tpps_init(void)
{
int ret;
ret = blkcg_policy_register(&blkcg_policy_tpps);
if (ret)
return ret;
ret = -ENOMEM;
tpps_pool = KMEM_CACHE(tpps_queue, 0);
if (!tpps_pool)
goto err_pol_unreg;
ret = elv_register(&iosched_tpps);
if (ret)
goto err_free_pool;
return 0;
err_free_pool:
kmem_cache_destroy(tpps_pool);
err_pol_unreg:
blkcg_policy_unregister(&blkcg_policy_tpps);
return ret;
}
static void __exit tpps_exit(void)
{
blkcg_policy_unregister(&blkcg_policy_tpps);
elv_unregister(&iosched_tpps);
kmem_cache_destroy(tpps_pool);
}
module_init(tpps_init);
module_exit(tpps_exit);
MODULE_AUTHOR("Robin Dong");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Tiny Parallel Proportion io Scheduler");